mindspore-2.4.1-cp311-cp311-win_amd64.whl → mindspore-2.5.0-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore has been flagged as potentially problematic.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +8 -3
- mindspore/_c_dataengine.cp311-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp311-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp311-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +0 -5
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/compile_config.py +64 -0
- mindspore/_extends/parse/deprecated/__init__.py +0 -0
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +375 -0
- mindspore/_extends/parse/parser.py +23 -5
- mindspore/_extends/parse/standard_method.py +123 -27
- mindspore/_extends/pijit/pijit_func_white_list.py +1 -1
- mindspore/amp.py +7 -1
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/boost_cell_wrapper.py +136 -41
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +3 -1
- mindspore/common/_register_for_tensor.py +0 -1
- mindspore/common/_stub_tensor.py +25 -4
- mindspore/common/_tensor_cpp_method.py +17 -0
- mindspore/common/_tensor_docs.py +6132 -0
- mindspore/common/api.py +99 -25
- mindspore/common/dtype.py +34 -34
- mindspore/common/dump.py +2 -1
- mindspore/common/file_system.py +8 -1
- mindspore/common/generator.py +2 -0
- mindspore/common/hook_handle.py +3 -1
- mindspore/common/initializer.py +3 -4
- mindspore/common/lazy_inline.py +8 -2
- mindspore/common/mindir_util.py +10 -2
- mindspore/common/parameter.py +30 -27
- mindspore/common/tensor.py +713 -1337
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +10 -0
- mindspore/communication/comm_func.py +215 -173
- mindspore/communication/management.py +23 -20
- mindspore/context.py +292 -193
- mindspore/dataset/__init__.py +23 -19
- mindspore/dataset/callback/ds_callback.py +2 -1
- mindspore/dataset/core/config.py +84 -3
- mindspore/dataset/engine/cache_admin.py +3 -3
- mindspore/dataset/engine/cache_client.py +5 -4
- mindspore/dataset/engine/datasets.py +192 -149
- mindspore/dataset/engine/datasets_audio.py +14 -0
- mindspore/dataset/engine/datasets_standard_format.py +28 -11
- mindspore/dataset/engine/datasets_text.py +38 -1
- mindspore/dataset/engine/datasets_user_defined.py +125 -65
- mindspore/dataset/engine/datasets_vision.py +81 -8
- mindspore/dataset/engine/iterators.py +281 -63
- mindspore/dataset/engine/obs/util.py +8 -0
- mindspore/dataset/engine/queue.py +40 -0
- mindspore/dataset/engine/samplers.py +26 -2
- mindspore/dataset/engine/serializer_deserializer.py +1 -1
- mindspore/dataset/engine/validators.py +43 -11
- mindspore/dataset/transforms/py_transforms_util.py +17 -0
- mindspore/dataset/transforms/transforms.py +29 -12
- mindspore/dataset/vision/validators.py +1 -2
- mindspore/device_context/__init__.py +21 -0
- mindspore/device_context/ascend/__init__.py +25 -0
- mindspore/device_context/ascend/device.py +72 -0
- mindspore/device_context/ascend/op_debug.py +94 -0
- mindspore/device_context/ascend/op_precision.py +193 -0
- mindspore/device_context/ascend/op_tuning.py +127 -0
- mindspore/device_context/cpu/__init__.py +25 -0
- mindspore/device_context/cpu/device.py +62 -0
- mindspore/device_context/cpu/op_tuning.py +43 -0
- mindspore/device_context/gpu/__init__.py +21 -0
- mindspore/device_context/gpu/device.py +70 -0
- mindspore/device_context/gpu/op_precision.py +67 -0
- mindspore/device_context/gpu/op_tuning.py +175 -0
- mindspore/device_manager.py +134 -0
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/llm_boost/__init__.py +3 -2
- mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
- mindspore/experimental/llm_boost/atb/boost_base.py +239 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +52 -30
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/experimental/llm_boost/register.py +1 -0
- mindspore/experimental/optim/adadelta.py +26 -22
- mindspore/experimental/optim/adam.py +3 -0
- mindspore/experimental/optim/lr_scheduler.py +33 -24
- mindspore/experimental/optim/radam.py +33 -30
- mindspore/hal/device.py +28 -0
- mindspore/hal/event.py +17 -0
- mindspore/hal/memory.py +94 -3
- mindspore/hal/stream.py +91 -6
- mindspore/include/api/context.h +1 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +12 -0
- mindspore/mindrecord/__init__.py +1 -1
- mindspore/mindrecord/config.py +17 -316
- mindspore/mindrecord/filereader.py +1 -9
- mindspore/mindrecord/filewriter.py +5 -15
- mindspore/mindrecord/mindpage.py +1 -9
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +824 -218
- mindspore/mint/distributed/__init__.py +66 -4
- mindspore/mint/distributed/distributed.py +2594 -44
- mindspore/mint/linalg/__init__.py +6 -0
- mindspore/mint/nn/__init__.py +473 -14
- mindspore/mint/nn/functional.py +486 -11
- mindspore/mint/nn/layer/__init__.py +17 -4
- mindspore/mint/nn/layer/_functions.py +330 -0
- mindspore/mint/nn/layer/activation.py +169 -1
- mindspore/mint/nn/layer/basic.py +123 -0
- mindspore/mint/nn/layer/conv.py +727 -0
- mindspore/mint/nn/layer/normalization.py +215 -19
- mindspore/mint/nn/layer/padding.py +797 -0
- mindspore/mint/nn/layer/pooling.py +170 -0
- mindspore/mint/optim/__init__.py +2 -1
- mindspore/mint/optim/adam.py +223 -0
- mindspore/mint/optim/adamw.py +26 -19
- mindspore/mint/special/__init__.py +2 -1
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/multiprocessing/__init__.py +5 -0
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +142 -21
- mindspore/nn/dynamic_lr.py +2 -1
- mindspore/nn/layer/activation.py +6 -6
- mindspore/nn/layer/basic.py +35 -25
- mindspore/nn/layer/channel_shuffle.py +3 -3
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/embedding.py +3 -3
- mindspore/nn/layer/normalization.py +8 -7
- mindspore/nn/layer/padding.py +4 -3
- mindspore/nn/layer/pooling.py +55 -23
- mindspore/nn/layer/rnn_cells.py +1 -1
- mindspore/nn/layer/rnns.py +2 -1
- mindspore/nn/layer/timedistributed.py +5 -5
- mindspore/nn/layer/transformer.py +48 -26
- mindspore/nn/learning_rate_schedule.py +5 -3
- mindspore/nn/loss/loss.py +31 -36
- mindspore/nn/optim/ada_grad.py +1 -0
- mindspore/nn/optim/adadelta.py +2 -2
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lars.py +1 -4
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/rprop.py +2 -2
- mindspore/nn/optim/thor.py +2 -1
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +73 -0
- mindspore/nn/wrap/cell_wrapper.py +4 -6
- mindspore/nn/wrap/loss_scale.py +3 -4
- mindspore/numpy/array_creations.py +60 -62
- mindspore/numpy/array_ops.py +148 -143
- mindspore/numpy/logic_ops.py +41 -42
- mindspore/numpy/math_ops.py +361 -359
- mindspore/numpy/utils.py +16 -16
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +2 -1
- mindspore/ops/_grad_experimental/grad_comm_ops.py +107 -8
- mindspore/ops/_grad_experimental/grad_debug_ops.py +6 -1
- mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
- mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
- mindspore/ops/_op_impl/cpu/__init__.py +1 -0
- mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
- mindspore/ops/_vmap/vmap_array_ops.py +20 -19
- mindspore/ops/_vmap/vmap_base.py +0 -2
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +19 -13
- mindspore/ops/_vmap/vmap_math_ops.py +11 -9
- mindspore/ops/_vmap/vmap_nn_ops.py +20 -34
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +149 -12
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -61
- mindspore/ops/auto_generate/gen_extend_func.py +554 -60
- mindspore/ops/auto_generate/gen_ops_def.py +1621 -115
- mindspore/ops/auto_generate/gen_ops_prim.py +8027 -3411
- mindspore/ops/auto_generate/pyboost_inner_prim.py +183 -79
- mindspore/ops/composite/base.py +1 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +229 -30
- mindspore/ops/composite/multitype_ops/pow_impl.py +0 -29
- mindspore/ops/function/__init__.py +12 -0
- mindspore/ops/function/array_func.py +561 -159
- mindspore/ops/function/clip_func.py +64 -0
- mindspore/ops/function/debug_func.py +28 -20
- mindspore/ops/function/image_func.py +1 -1
- mindspore/ops/function/linalg_func.py +5 -4
- mindspore/ops/function/math_func.py +1664 -294
- mindspore/ops/function/nn_func.py +988 -317
- mindspore/ops/function/parameter_func.py +3 -56
- mindspore/ops/function/random_func.py +243 -33
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/functional.py +18 -5
- mindspore/ops/functional_overload.py +897 -0
- mindspore/ops/operations/__init__.py +3 -2
- mindspore/ops/operations/_embedding_cache_ops.py +4 -4
- mindspore/ops/operations/_grad_ops.py +2 -34
- mindspore/ops/operations/_infer_ops.py +2 -1
- mindspore/ops/operations/_inner_ops.py +38 -8
- mindspore/ops/operations/array_ops.py +45 -303
- mindspore/ops/operations/comm_ops.py +23 -17
- mindspore/ops/operations/custom_ops.py +7 -49
- mindspore/ops/operations/debug_ops.py +42 -47
- mindspore/ops/operations/inner_ops.py +6 -4
- mindspore/ops/operations/linalg_ops.py +3 -2
- mindspore/ops/operations/manually_defined/ops_def.py +185 -104
- mindspore/ops/operations/math_ops.py +11 -216
- mindspore/ops/operations/nn_ops.py +153 -310
- mindspore/ops/primitive.py +23 -21
- mindspore/ops/tensor_method.py +1669 -0
- mindspore/ops_generate/aclnn_kernel_register_auto_cc_generator.py +110 -0
- mindspore/ops_generate/add_tensor_docs_generator.py +54 -0
- mindspore/ops_generate/arg_handler.py +0 -61
- mindspore/ops_generate/auto_grad_impl_cc_generator.py +135 -0
- mindspore/ops_generate/auto_grad_reg_cc_generator.py +93 -0
- mindspore/ops_generate/base_generator.py +11 -0
- mindspore/ops_generate/cpp_create_prim_instance_helper_generator.py +108 -0
- mindspore/ops_generate/functional_map_cpp_generator.py +491 -0
- mindspore/ops_generate/functional_overload_py_generator.py +110 -0
- mindspore/ops_generate/functions_cc_generator.py +233 -0
- mindspore/ops_generate/gen_aclnn_implement.py +110 -114
- mindspore/ops_generate/gen_constants.py +157 -3
- mindspore/ops_generate/gen_ops.py +245 -990
- mindspore/ops_generate/gen_pyboost_func.py +97 -998
- mindspore/ops_generate/gen_utils.py +119 -33
- mindspore/ops_generate/lite_ops_cpp_generator.py +155 -0
- mindspore/ops_generate/op_api_proto.py +206 -0
- mindspore/ops_generate/op_def_py_generator.py +131 -0
- mindspore/ops_generate/op_prim_py_generator.py +480 -0
- mindspore/ops_generate/op_proto.py +373 -108
- mindspore/ops_generate/op_template_parser.py +436 -0
- mindspore/ops_generate/ops_def_cc_generator.py +288 -0
- mindspore/ops_generate/ops_def_h_generator.py +74 -0
- mindspore/ops_generate/ops_name_h_generator.py +68 -0
- mindspore/ops_generate/ops_primitive_h_generator.py +81 -0
- mindspore/ops_generate/pyboost_functions_cpp_generator.py +370 -0
- mindspore/ops_generate/pyboost_functions_h_generator.py +68 -0
- mindspore/ops_generate/pyboost_functions_py_generator.py +148 -0
- mindspore/ops_generate/pyboost_grad_function_cpp_generator.py +154 -0
- mindspore/ops_generate/pyboost_inner_prim_generator.py +131 -0
- mindspore/ops_generate/pyboost_native_grad_functions_generator.py +268 -0
- mindspore/ops_generate/pyboost_op_cpp_code_generator.py +851 -0
- mindspore/ops_generate/pyboost_overload_functions_cpp_generator.py +344 -0
- mindspore/ops_generate/pyboost_utils.py +92 -33
- mindspore/ops_generate/template.py +294 -44
- mindspore/ops_generate/tensor_func_reg_cpp_generator.py +422 -0
- mindspore/parallel/__init__.py +3 -3
- mindspore/parallel/_auto_parallel_context.py +44 -34
- mindspore/parallel/_cell_wrapper.py +22 -3
- mindspore/parallel/_parallel_serialization.py +13 -2
- mindspore/parallel/_utils.py +4 -2
- mindspore/parallel/algo_parameter_config.py +1 -1
- mindspore/parallel/checkpoint_transform.py +44 -0
- mindspore/parallel/cluster/process_entity/_api.py +131 -37
- mindspore/parallel/cluster/process_entity/_utils.py +41 -6
- mindspore/parallel/cluster/run.py +20 -3
- mindspore/parallel/parameter_broadcast.py +1 -1
- mindspore/parallel/shard.py +3 -0
- mindspore/parallel/transform_safetensors.py +119 -253
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +17 -4
- mindspore/profiler/analysis/__init__.py +0 -0
- mindspore/profiler/analysis/parser/__init__.py +0 -0
- mindspore/profiler/analysis/parser/ascend_cann_parser.py +166 -0
- mindspore/profiler/analysis/parser/base_parser.py +158 -0
- mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
- mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
- mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +261 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +84 -0
- mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
- mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
- mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
- mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +260 -0
- mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
- mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
- mindspore/profiler/analysis/task_manager.py +131 -0
- mindspore/profiler/analysis/time_converter.py +84 -0
- mindspore/profiler/analysis/viewer/__init__.py +0 -0
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +333 -0
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +252 -0
- mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +313 -0
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +322 -0
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +265 -0
- mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
- mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
- mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +97 -0
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
- mindspore/profiler/analysis/work_flow.py +73 -0
- mindspore/profiler/common/ascend_msprof_exporter.py +138 -0
- mindspore/profiler/common/command_executor.py +90 -0
- mindspore/profiler/common/constant.py +174 -3
- mindspore/profiler/common/file_manager.py +208 -0
- mindspore/profiler/common/log.py +130 -0
- mindspore/profiler/common/msprof_cmd_tool.py +202 -0
- mindspore/profiler/common/path_manager.py +371 -0
- mindspore/profiler/common/process_bar.py +168 -0
- mindspore/profiler/common/process_pool.py +9 -3
- mindspore/profiler/common/profiler_context.py +476 -0
- mindspore/profiler/common/profiler_info.py +304 -0
- mindspore/profiler/common/profiler_output_path.py +284 -0
- mindspore/profiler/common/profiler_parameters.py +210 -0
- mindspore/profiler/common/profiler_path_manager.py +120 -0
- mindspore/profiler/common/record_function.py +76 -0
- mindspore/profiler/common/tlv_decoder.py +76 -0
- mindspore/profiler/common/util.py +75 -2
- mindspore/profiler/dynamic_profiler.py +270 -37
- mindspore/profiler/envprofiler.py +138 -0
- mindspore/profiler/mstx.py +199 -0
- mindspore/profiler/platform/__init__.py +21 -0
- mindspore/profiler/platform/base_profiler.py +40 -0
- mindspore/profiler/platform/cpu_profiler.py +124 -0
- mindspore/profiler/platform/gpu_profiler.py +74 -0
- mindspore/profiler/platform/npu_profiler.py +309 -0
- mindspore/profiler/profiler.py +580 -93
- mindspore/profiler/profiler_action_controller.py +187 -0
- mindspore/profiler/profiler_interface.py +114 -0
- mindspore/profiler/schedule.py +208 -0
- mindspore/rewrite/api/symbol_tree.py +1 -2
- mindspore/run_check/_check_version.py +18 -13
- mindspore/runtime/__init__.py +37 -0
- mindspore/runtime/device.py +27 -0
- mindspore/runtime/event.py +209 -0
- mindspore/runtime/executor.py +148 -0
- mindspore/runtime/memory.py +392 -0
- mindspore/runtime/stream.py +460 -0
- mindspore/runtime/thread_bind_core.py +401 -0
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +2 -2
- mindspore/train/_utils.py +53 -18
- mindspore/train/amp.py +8 -4
- mindspore/train/callback/_checkpoint.py +32 -18
- mindspore/train/callback/_early_stop.py +1 -1
- mindspore/train/callback/_flops_collector.py +105 -69
- mindspore/train/callback/_history.py +1 -1
- mindspore/train/callback/_summary_collector.py +44 -6
- mindspore/train/callback/_tft_register.py +37 -15
- mindspore/train/dataset_helper.py +11 -11
- mindspore/train/metrics/precision.py +4 -5
- mindspore/train/mind_ir_pb2.py +167 -46
- mindspore/train/model.py +13 -14
- mindspore/train/serialization.py +461 -72
- mindspore/train/summary/summary_record.py +1 -2
- mindspore/train/train_thor/model_thor.py +1 -1
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +4 -2
- mindspore/utils/dryrun.py +138 -0
- mindspore/utils/runtime_execution_order_check.py +550 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/METADATA +3 -4
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/RECORD +391 -265
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/entry_points.txt +1 -1
- mindspore/common/_tensor_overload.py +0 -139
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/profiler/envprofiling.py +0 -254
- mindspore/profiler/profiling.py +0 -1926
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/WHEEL +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.5.0.dist-info}/top_level.txt +0 -0
mindspore/parallel/algo_parameter_config.py CHANGED
@@ -370,7 +370,7 @@ def get_algo_parameters(attr_key):
     Examples:
         >>> import mindspore as ms
         >>> ms.get_algo_parameters("fully_use_devices")
-
+        False
     """
     if attr_key not in get_algo_parameters_config_func_map:
         raise ValueError("Get context keyword %s is not recognized!" % attr_key)
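The only change in this hunk is that the docstring example now shows its expected output. For orientation, a hedged companion snippet using the existing public setter/getter pair this docstring belongs to (mindspore.set_algo_parameters / mindspore.get_algo_parameters); it is an illustration, not part of the diff:

    import mindspore as ms

    ms.set_algo_parameters(fully_use_devices=False)      # existing setter for the same key
    print(ms.get_algo_parameters("fully_use_devices"))   # prints False, matching the new docstring output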
mindspore/parallel/checkpoint_transform.py CHANGED
@@ -28,6 +28,7 @@ from mindspore.parallel._parallel_serialization import _rank_list_for_transform_
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
     _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src
 from mindspore.parallel.transform_safetensors import _transform_safetensors, _collect_safetensor_files
+from mindspore._c_expression import AutoParallelContext

 __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
            "transform_checkpoints", "sync_pipeline_shared_parameters", "load_segmented_checkpoints"]
@@ -648,3 +649,46 @@ def load_segmented_checkpoints(ckpt_file_dir, net=None, strict_load=False, filte
         parameter_dict.update(ms.load_checkpoint(checkpoint_file, net, strict_load, filter_prefix, dec_key,
                                                  dec_mode, specify_prefix, choice_func))
     return parameter_dict
+
+
+def set_op_strategy_config(mode="SAVE", path=""):
+    """
+    Set strategy json configuration when using sharding propagation.
+
+    .. warning::
+        This is an experimental interface, may be changed or canceled in the future;
+        This interface currently doesn't support saving or loading strategies using layout.
+
+    Note:
+        - It only works when `parallel_mode=ParallelMode.AUTO_PARALLEL` and `search_mode='sharding_propagation'`.
+        - It only supports saving and reloading with the same configuration for the same network. If the network
+          or training hyperparameters are modified after using the `SAVE` mode to save the strategies of operator
+          to the setting json file, which may lead to the failure of using the `LOAD` mode to load operator
+          strategies from json.
+        - When performing distributed training, users can first save the strategy using dryrun on a single device
+          and then load strategy to perform distributed training.
+
+    Args:
+        mode (str): The parameter for choosing save or load .json file. Default value: ``"SAVE"`` .
+        path (str): Path to save or load parallel strategy json, must be an absolute path. Default value: ``""`` .
+
+    Raises:
+        KeyError: When type is not ``"SAVE"`` or ``"LOAD"`` .
+        KeyError: When path does not end in ``".json"`` .
+        KeyError: When path is not an absolute path.
+    """
+    if not os.path.isabs(path):
+        raise KeyError("File path must be an absolute path")
+    _, file_type = os.path.splitext(path)
+    if file_type != ".json":
+        raise KeyError("File type must be .json")
+    dir_path = os.path.dirname(path)
+    if dir_path and not os.path.exists(dir_path):
+        os.makedirs(dir_path, mode=0o700, exist_ok=True)
+    check_mode_type = ["SAVE", "LOAD"]
+    if mode in check_mode_type:
+        if AutoParallelContext.get_instance() is None:
+            raise ValueError("Get AutoParallelContext instance failed!!!")
+        AutoParallelContext.get_instance().set_ops_strategy_json_config(mode, path, "all")
+    else:
+        raise KeyError("Type must be 'SAVE' or 'LOAD'")
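A minimal usage sketch of the new interface (my illustration, not part of the diff). It assumes set_op_strategy_config can be imported from mindspore.parallel.checkpoint_transform, where this hunk defines it, and that the job runs under auto parallel with sharding propagation as the Note requires:

    import mindspore as ms
    from mindspore.parallel.checkpoint_transform import set_op_strategy_config

    ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.AUTO_PARALLEL,
                                 search_mode="sharding_propagation")

    # First run (possibly a single-device dryrun): persist the propagated operator strategies.
    set_op_strategy_config(mode="SAVE", path="/tmp/op_strategy.json")

    # Later run with the same network and hyperparameters: reuse the saved strategies.
    # set_op_strategy_config(mode="LOAD", path="/tmp/op_strategy.json")

Per the Raises section, the path must be absolute and end in .json, and any mode other than "SAVE" or "LOAD" raises KeyError.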
mindspore/parallel/cluster/process_entity/_api.py CHANGED
@@ -16,23 +16,30 @@
 import os
 import re
 import sys
+import signal
 import subprocess
+import socket
 import mindspore.log as logger
 from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url,\
-    _is_local_ip, _send_scale_num
+    _is_local_ip, _convert_addr_to_ip, _send_scale_num, _get_local_ip

 class _Node:
     """
     Base class for dynamic networking nodes.

     """
-    def __init__(self, worker_num, sched_host, sched_port, timeout, args_list, output_file
+    def __init__(self, worker_num, sched_host, sched_port, timeout, args_list, output_file, tail_worker_log,
+                 join, is_simulation):
         self.worker_num = worker_num
         self.sched_host = sched_host
         self.sched_port = sched_port
         self.args_list = args_list
         self.output_file = output_file
         self.timeout = timeout
+        self.tail_worker_log = tail_worker_log
+        self.join = join
+        self.is_simulation = is_simulation
+

     def run(self):
         """
@@ -40,9 +47,11 @@ class _Node:

         """
         os.environ["MS_WORKER_NUM"] = str(self.worker_num)
-
-
-
+        # If simulation level is set, environment variables for dynamic networking will not be set and scheduler will not be started.
+        if not self.is_simulation:
+            os.environ["MS_SCHED_HOST"] = self.sched_host
+            os.environ["MS_SCHED_PORT"] = str(self.sched_port)
+            os.environ["MS_TOPO_TIMEOUT"] = str(self.timeout)

 class _MetaServerNode(_Node):
     """
@@ -63,8 +72,10 @@ class _ComputeGraphNode(_Node):
     """
     Worker node for dynamic networking. Inherits from the Node class.
     """
-    def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file
-
+    def __init__(self, worker_num, sched_host, sched_port, timeout, node_id, args_list, output_file,
+                 tail_worker_log, join, is_simulation):
+        super().__init__(worker_num, sched_host, sched_port, timeout, args_list, output_file,
+                         tail_worker_log, join, is_simulation)
         self.node_id = node_id


@@ -78,9 +89,36 @@ class _ComputeGraphNode(_Node):
         super().run()
         if self.node_id is not None:
             os.environ["MS_NODE_ID"] = str(self.node_id)
-
+        # If simulation level is set, environment variable 'MS_ROLE' will not be set.
+        if not self.is_simulation:
+            os.environ["MS_ROLE"] = "MS_WORKER"
+        tail_worker_process = None
+        is_tail_worker_log = self.enable_tail_worker_log()
+        if self.join and not is_tail_worker_log:
+            logger.warning(f"The '--tail_worker_log' is:{self.tail_worker_log}, "
+                           f"which doesn't contain this worker {self.node_id}."
+                           f" So this worker {self.node_id}'s log will not be output to console. Reset "
+                           "'--tail_worker_log', if you want to output this worker's log to console.")
         with open(self.output_file, "w") as file_handle:
-
+            worker_process = subprocess.Popen(self.args_list, preexec_fn=os.setsid, stdout=file_handle,
+                                              stderr=subprocess.STDOUT)
+            if self.join and is_tail_worker_log:
+                tail_worker_process = self.output_to_console()
+        return worker_process, tail_worker_process
+
+    def output_to_console(self):
+        """
+        Output worker log file to console.
+        """
+        return subprocess.Popen(['/usr/bin/tail', '-f', self.output_file])
+
+    def enable_tail_worker_log(self):
+        tail_worker_log_list = []
+        if self.tail_worker_log != "-1":
+            tail_worker_log_list.extend([int(num) for num in self.tail_worker_log.split(',')])
+        if self.tail_worker_log != "-1" and self.node_id not in tail_worker_log_list:
+            return False
+        return True


 class _ProcessManager:
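The hunks above thread the new --tail_worker_log, --join and simulation options through the node classes; the remaining _api.py hunks below apply the same options to _ProcessManager. As a self-contained sketch of the tail-selection rule implemented by enable_tail_worker_log ("-1" tails every worker, otherwise only the comma-separated rank ids are tailed); the helper name here is mine, for illustration only:

    def _should_tail(tail_worker_log: str, node_id: int) -> bool:
        # Mirror of enable_tail_worker_log: "-1" means all workers, otherwise an id allow-list.
        if tail_worker_log == "-1":
            return True
        return node_id in [int(num) for num in tail_worker_log.split(',')]

    assert _should_tail("-1", 5) is True      # default: every worker's log is tailed
    assert _should_tail("0,1", 1) is True     # worker 1 explicitly requested
    assert _should_tail("0,1", 5) is False    # worker 5 not requested, its log stays in the file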
@@ -99,13 +137,14 @@ class _ProcessManager:
         """
         self.msn_process = None
         self.cgn_processes = []
+        self.tail_cgn_processes = []

-
-        self.is_master = _is_local_ip(args.master_addr)
-
-        self.master_addr = args.master_addr
+        self.master_addr = _convert_addr_to_ip(args.master_addr)
         self.master_port = args.master_port

+        """`is_master` flags whether the current node is the master node."""
+        self.is_master = _is_local_ip(self.master_addr)
+
         self.worker_num = args.worker_num
         if self.worker_num <= 0:
             raise ValueError(f"worker_num must be greater than 0, but got {self.worker_num}.")
@@ -115,6 +154,8 @@ class _ProcessManager:

         self.log_dir = args.log_dir
         self.join = args.join
+        self.worker_log_name = args.worker_log_name
+        self.tail_worker_log = args.tail_worker_log
         self.cluster_time_out = args.cluster_time_out
         self.bind_core = args.bind_core
         self.rank_table_file = args.rank_table_file
@@ -123,19 +164,21 @@ class _ProcessManager:
         self.sim_rank_id = args.sim_rank_id
         self.is_simulation = (self.sim_level != -1)
         if self.is_simulation:
-            # If simulation level is set, reset the worker_num and local_worker_num to 1
-            # so that host cluster could be initialized.
-            self.worker_num = 1
-            self.local_worker_num = 1
             os.environ["MS_SIMULATION_LEVEL"] = str(self.sim_level)
         elif os.getenv("MS_SIMULATION_LEVEL"):
-            # If simulation level env is set, load RANK_ID and RANK_SIZE envs.
-            self.worker_num = 1
-            self.local_worker_num = 1
             self.is_simulation = True
-            self.sim_rank_id = os.getenv("RANK_ID", "
+            self.sim_rank_id = int(os.getenv("RANK_ID", "-1"))
             if os.getenv("RANK_SIZE"):
                 self.exported_rank_size = os.getenv("RANK_SIZE")
+        # If sim_rank_id is set, single worker can be started.
+        if self.is_simulation and (self.sim_rank_id != -1):
+            logger.info(f"Simulation rank id is set to {self.sim_rank_id}, will dryrun a single process.")
+            self.local_worker_num = 1
+        if self.is_simulation and self.local_worker_num > 128:
+            self.local_worker_num = 1
+            self.sim_rank_id = 0
+            logger.warning(f"In dryrun case, local worker num is set to larger than 128. "
+                           "To avoid a system clash, local worker num is set to 1.")

         self.cmd = args.task_script
         self.cmd_args = args.task_script_args
@@ -173,7 +216,7 @@ class _ProcessManager:
             else:
                 sys.exit()
         else:
-            if self.is_master:
+            if self.is_master and not self.is_simulation:
                 self.start_scheduler()
             self.start_workers()

@@ -190,7 +233,8 @@ class _ProcessManager:
         os.environ['RANK_ID'] = str(0)
         msn = _MetaServerNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
                               _generate_cmd_args_list(self.cmd, self.cmd_args),
-                              os.path.join(self.log_dir, "scheduler.log")
+                              os.path.join(self.log_dir, "scheduler.log"), self.tail_worker_log, self.join,
+                              self.is_simulation)
         self.msn_process = msn.run()

     def start_workers(self):
@@ -208,9 +252,6 @@ class _ProcessManager:
                        "You can access 'RANK_ID' environment variable after calling "
                        "'mindspore.communication.init()'")

-        if self.is_simulation and self.worker_num != 1:
-            raise ValueError(f"Simulation level is set, worker_num must be 1, but got {self.worker_num}.")
-
         for i in range(self.local_worker_num):
             os.environ["DEVICE_ID"] = str(i)
             node_id, log_name = self._get_node_id_and_log_path(i)
@@ -223,9 +264,10 @@ class _ProcessManager:
                 os.environ["RANK_ID"] = str(node_id)
                 logger.warning(f"Start worker process with rank id:{node_id}, log file:{log_name}. "
                                "Environment variable [RANK_ID] is exported.")
-            if self.is_simulation:
-                # Reset RANK_ID env to sim_rank_id.
+            if self.is_simulation and (self.sim_rank_id != -1):
+                # Reset RANK_ID env to sim_rank_id if sim_rank_id is set.
                 os.environ["RANK_ID"] = str(self.sim_rank_id)
+                logger.warning(f"In dryrun case, RANK_ID is assigned to {self.sim_rank_id}.")

             cpu_num = subprocess.getoutput("cat /proc/cpuinfo|grep processor|wc -l")
             if not cpu_num.isdigit():
@@ -238,9 +280,11 @@ class _ProcessManager:
             else:
                 cmd = _generate_cmd_args_list(self.cmd, self.cmd_args)
             cgn = _ComputeGraphNode(self.worker_num, self.master_addr, self.master_port, self.cluster_time_out,
-                                    node_id, cmd, log_name)
-            process = cgn.run()
+                                    node_id, cmd, log_name, self.tail_worker_log, self.join, self.is_simulation)
+            process, tail_process = cgn.run()
             self.cgn_processes.append(process)
+            self.tail_cgn_processes.append(tail_process)
+

     def join_processes(self):
         """
@@ -248,8 +292,14 @@ class _ProcessManager:
         If there's any process does not exit normally, logs will be analyzed
         so that understandable root cause of exception could be returned.
         """
+        def signal_handler(sig, frame):
+            logger.warning("msrun process received SIGNIN (Ctrl+C), terminating all workers.")
+            self.kill_all_processes()
+            sys.exit(0)
+
         has_exception = False
         success_cgn_processes = set()
+        signal.signal(signal.SIGINT, signal_handler)
         while True:
             # Traversal all workers and kill immediately if any exception happens.
             for p in self.cgn_processes:
@@ -266,15 +316,14 @@ class _ProcessManager:

             if has_exception:
                 logger.warning("There's worker exits with exception, kill all other workers.")
-
-
-                p.kill()
+                self.kill_worker_processes()
+                self.kill_tail_log_processes()
                 break
             elif len(success_cgn_processes) == len(self.cgn_processes):
                 logger.info("All workers successfully exit!")
+                self.kill_tail_log_processes()
                 break

-
         if self.msn_process:
             self.msn_process.wait()
             if self.msn_process.returncode != 0:
@@ -287,6 +336,35 @@ class _ProcessManager:
             raise RuntimeError("Distributed job exited with exception. Please check logs in "
                                f"directory: {self.log_dir}.")

+    def kill_tail_log_processes(self):
+        """
+        Kills all tail worker log processes.
+
+        """
+        for p_tail in self.tail_cgn_processes:
+            if p_tail is not None:
+                logger.debug("Tail worker log process:{p_tail.pid} has been killed!")
+                p_tail.kill()
+
+    def kill_worker_processes(self):
+        """
+        Kills all worker processes.
+
+        """
+        for p in self.cgn_processes:
+            if p.poll() is None:
+                os.killpg(os.getpgid(p.pid), signal.SIGKILL)
+
+    def kill_all_processes(self):
+        """
+        Kills all running processes, including scheduler, worker and tail log.
+
+        """
+        self.kill_worker_processes()
+        self.kill_tail_log_processes()
+        if self.msn_process.poll() is None:
+            self.msn_process.kill()
+
     def stop_processes(self):
         """
         Stops all running processes.
@@ -310,26 +388,29 @@ class _ProcessManager:
         self.start_scheduler()
         self.start_workers()

+
     def _get_node_id_and_log_path(self, index):
         """
         Generate node id and log path for corresponding process.
         """
+        formatted_log_name = self.format_worker_log_name()
         if self.local_worker_num > self.worker_num:
             raise ValueError(f"Total worker number is {self.worker_num}, "
                              f"but got exceeded local worker number: {self.local_worker_num}.")
         if self.local_worker_num == self.worker_num:
-            return index, os.path.join(self.log_dir, "
+            return index, os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")

         if self.node_rank >= 0:
             # We assume that each node has same process number.
             node_id = self.node_rank * self.local_worker_num + index
-            log_name = os.path.join(self.log_dir, "
+            log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(node_id) + ".log")
         else:
             # If node_rank is default value -1, let MindSpore assign rank id.
             node_id = None
-            log_name = os.path.join(self.log_dir, "
+            log_name = os.path.join(self.log_dir, formatted_log_name + "_" + str(index) + ".log")
         return node_id, log_name

+
     def _analyze_log(self):
         """
         Analyze exception logs.
@@ -350,3 +431,16 @@ class _ProcessManager:
             logger.error(f"Time out nodes are {time_out_node_ids}")

         os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
+
+
+    def format_worker_log_name(self):
+        """
+        Format worker log files' name.
+        """
+        if not self.worker_log_name:
+            formatted_worker_log_name = "worker"
+        else:
+            current_ip = _get_local_ip(self.master_addr)
+            formatted_worker_log_name = re.sub(r'\{ip\}', current_ip, self.worker_log_name)
+            formatted_worker_log_name = re.sub(r'\{hostname\}', socket.gethostname(), formatted_worker_log_name)
+        return formatted_worker_log_name
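The new format_worker_log_name hook lets per-worker log file names embed the node's IP address and host name. A hedged standalone illustration of the substitution rule (the real code obtains the IP from --master_addr via _get_local_ip; the function and values below are made up for the example):

    import re
    import socket

    def format_log_name(template: str, current_ip: str) -> str:
        # Expand {ip} and {hostname} placeholders as documented for --worker_log_name.
        if not template:
            return "worker"
        name = re.sub(r'\{ip\}', current_ip, template)
        return re.sub(r'\{hostname\}', socket.gethostname(), name)

    # With --worker_log_name=worker_{ip}_{hostname}_test on host "node0" at 10.0.0.5,
    # worker 3's log becomes <log_dir>/worker_10.0.0.5_node0_test_3.log
    print(format_log_name("worker_{ip}_{hostname}_test", "10.0.0.5"))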
mindspore/parallel/cluster/process_entity/_utils.py CHANGED
@@ -16,8 +16,11 @@
 import os
 import json
 import socket
+import ipaddress
 import mindspore.log as logger

+CURRENT_IP = None
+
 def _generate_cmd(cmd, cmd_args, output_name):
     """
     Generates a command string to execute a Python script in the background, r
@@ -67,6 +70,24 @@ def _generate_url(addr, port):
     return url


+def _get_local_ip(ip_address):
+    """
+    Get current IP address.
+
+    """
+    global CURRENT_IP
+    if CURRENT_IP is None:
+        try:
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            s.connect((ip_address, 0))
+            CURRENT_IP = s.getsockname()[0]
+            s.close()
+        except Exception as e:
+            raise RuntimeError(f"Get local ip failed: {e}. Please check whether an accessible address "
+                               "is input by '--master_address'.")
+    return CURRENT_IP
+
+
 def _is_local_ip(ip_address):
     """
     Check if the current input IP address is a local IP address.
@@ -75,13 +96,8 @@ def _is_local_ip(ip_address):
     p = os.popen("ip -j addr")
     addr_info_str = p.read()
     p.close()
+    current_ip = _get_local_ip(ip_address)
     if not addr_info_str:
-        # This means this host has no "ip -j addr" command.
-        # We use socket module to get local ip address.
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect((ip_address, 0))
-        current_ip = s.getsockname()[0]
-        s.close()
         return current_ip == ip_address

     addr_infos = json.loads(addr_info_str)
@@ -93,6 +109,25 @@ def _is_local_ip(ip_address):
             return False


+def _convert_addr_to_ip(master_addr):
+    """
+    Check whether the input parameter 'master_addr' is IPv4. If a hostname is inserted, it will be converted
+    to IP and then set as master host's IP.
+
+    """
+    try:
+        ipaddress.IPv4Address(master_addr)
+        return master_addr
+    except ipaddress.AddressValueError:
+        try:
+            ip_address = socket.gethostbyname(master_addr)
+            logger.info(f"Convert input host name:{master_addr} to ip address:{ip_address}.")
+            return ip_address
+        except socket.gaierror as e:
+            raise RuntimeError(f"DNS resolution failed: {e}. Please check whether a correct host name "
+                               "is input by '--master_address'.")
+
+
 def _send_scale_num(url, scale_num):
     """
     Send an HTTP request to a specified URL, informing scale_num.
mindspore/parallel/cluster/run.py CHANGED
@@ -37,8 +37,8 @@ def get_args():
     parser.add_argument(
         "--master_addr",
         default="127.0.0.1", type=str,
-        help="specifies the IP address of the scheduler and its data type is string."
-             " Allowed values: valid IP addresses."
+        help="specifies the IP address or the host name of the scheduler and its data type is string."
+             " Allowed values: valid IP addresses or valid host name."
     )
     parser.add_argument(
         "--master_port", default=8118, type=int,
@@ -91,7 +91,7 @@ def get_args():
     )
     parser.add_argument(
         "--sim_rank_id",
-        default
+        default=-1,
         type=int,
         help="specifies simulation process's rank id. Only one process is spawned in simulation scenario."
     )
@@ -102,6 +102,23 @@ def get_args():
         help="specifies rank table file path. This path is not used to initialize distributed job in "
              "'rank table file manner' but to help support other features."
     )
+    parser.add_argument(
+        "--worker_log_name",
+        default="",
+        type=str,
+        help="Specifies the worker log file name as a string for current node; the default is worker_[rankid]. "
+             "Support configuring the current IP address and host name by using {ip} and {hostname} respectively. "
+             "e.g. --worker_log_name=worker_{ip}_{hostname}_test, worker [rankid] log name for current node "
+             "will be worker_[real IP address]_[real host name]_test_[rankid]."
+    )
+    parser.add_argument(
+        "--tail_worker_log",
+        default="-1",
+        type=str,
+        help="Only tail worker log to console when '--join=True' and the configured value should be within "
+             "[0, local_worker_num], otherwise worker log will not be tail. All worker logs will be tail by "
+             "default. Support tail the specified worker log (e.g. --tail_log=0 tail the worker 0 log to console)."
+    )
     parser.add_argument(
         "task_script",
         type=str,
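An illustrative invocation combining the new options (an assumed single-node job with eight ranks and a training script train.py; not taken from the package):

    msrun --worker_num=8 --local_worker_num=8 --master_addr=node0 --join=True \
          --worker_log_name=worker_{ip}_{hostname} --tail_worker_log=0 train.py

Here the host name node0 is resolved to an IPv4 address by _convert_addr_to_ip, each worker writes to <log_dir>/worker_<ip>_<hostname>_<rankid>.log, and only worker 0's log is tailed to the console while --join=True.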
mindspore/parallel/parameter_broadcast.py CHANGED
@@ -56,7 +56,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
         >>> from mindspore.parallel.parameter_broadcast import parameter_broadcast
         >>> from mindspore.train.serialization import load_checkpoint, load_param_into_net
         >>> ms.set_context(mode=ms.GRAPH_MODE)
-        >>> ms.
+        >>> ms.runtime.set_memory(max_size="28GB")
         >>> ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL)
         >>> init()
         >>> ms.set_seed(1)
mindspore/parallel/shard.py CHANGED
@@ -24,6 +24,9 @@ class Layout:
     """
     Parallel layout describes the detailed sharding information.

+    For more detailed information, refer to the file `Higher-order Operator-level Parallelism
+    <https://www.mindspore.cn/docs/en/master/model_train/parallel/advanced_operator_parallel.html>`_.
+
     Note:
         - It is valid only in semi auto parallel or auto parallel mode.
        - The multiplication result of the `device_matrix` must be equal to the device count in a pipeline stage.