mindspore 2.6.0__cp39-cp39-win_amd64.whl → 2.7.0rc1__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +1 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +40 -9
- mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
- mindspore/_extends/optimize/cell_utils.py +96 -0
- mindspore/_extends/parse/__init__.py +2 -2
- mindspore/_extends/parse/compile_config.py +44 -22
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -1
- mindspore/_extends/parse/parser.py +36 -61
- mindspore/_extends/parse/resources.py +39 -0
- mindspore/_extends/parse/standard_method.py +32 -13
- mindspore/_extends/parse/trope.py +8 -1
- mindspore/_extends/pijit/__init__.py +1 -2
- mindspore/amp.py +4 -4
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/adasum.py +1 -1
- mindspore/boost/boost_cell_wrapper.py +4 -4
- mindspore/common/__init__.py +27 -2
- mindspore/common/_grad_function.py +2 -1
- mindspore/common/_pijit_context.py +28 -7
- mindspore/common/_stub_tensor.py +1 -209
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +76 -15
- mindspore/common/api.py +193 -112
- mindspore/common/dtype.py +21 -11
- mindspore/common/dump.py +10 -15
- mindspore/common/generator.py +2 -3
- mindspore/common/hook_handle.py +11 -2
- mindspore/common/jit_config.py +1 -1
- mindspore/common/jit_trace.py +84 -105
- mindspore/common/parameter.py +26 -12
- mindspore/common/recompute.py +3 -3
- mindspore/common/sparse_tensor.py +0 -3
- mindspore/common/symbol.py +0 -1
- mindspore/common/tensor.py +48 -83
- mindspore/communication/_comm_helper.py +46 -4
- mindspore/communication/management.py +79 -7
- mindspore/context.py +38 -23
- mindspore/dataset/core/config.py +3 -3
- mindspore/dataset/engine/datasets.py +20 -7
- mindspore/dataset/engine/datasets_user_defined.py +32 -2
- mindspore/dataset/engine/iterators.py +2 -2
- mindspore/dataset/engine/obs/config_loader.py +2 -2
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
- mindspore/dataset/transforms/py_transforms.py +7 -3
- mindspore/dataset/transforms/transforms.py +7 -3
- mindspore/dataset/vision/validators.py +1 -0
- mindspore/device_context/ascend/device.py +1 -1
- mindspore/device_context/gpu/__init__.py +2 -2
- mindspore/device_context/gpu/device.py +1 -1
- mindspore/device_context/gpu/op_precision.py +4 -2
- mindspore/device_context/gpu/op_tuning.py +6 -3
- mindspore/device_manager.py +16 -9
- mindspore/dnnl.dll +0 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +3 -5
- mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
- mindspore/experimental/optim/adadelta.py +13 -20
- mindspore/experimental/optim/adagrad.py +15 -22
- mindspore/experimental/optim/adam.py +17 -24
- mindspore/experimental/optim/adamax.py +14 -22
- mindspore/experimental/optim/adamw.py +28 -34
- mindspore/experimental/optim/asgd.py +15 -25
- mindspore/experimental/optim/lr_scheduler.py +27 -45
- mindspore/experimental/optim/nadam.py +14 -24
- mindspore/experimental/optim/optimizer.py +13 -23
- mindspore/experimental/optim/radam.py +18 -24
- mindspore/experimental/optim/rmsprop.py +14 -25
- mindspore/experimental/optim/rprop.py +15 -26
- mindspore/experimental/optim/sgd.py +9 -19
- mindspore/hal/__init__.py +4 -4
- mindspore/hal/contiguous_tensors_handle.py +2 -2
- mindspore/hal/memory.py +1 -0
- mindspore/include/api/cell.h +37 -1
- mindspore/include/api/delegate.h +10 -0
- mindspore/include/api/model.h +3 -0
- mindspore/include/api/types.h +2 -2
- mindspore/include/c_api/model_c.h +0 -58
- mindspore/include/c_api/tensor_c.h +0 -26
- mindspore/include/dataset/vision_ascend.h +1 -1
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar10.py +60 -11
- mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mindspore_ops_host.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/__init__.py +4 -44
- mindspore/mint/distributed/__init__.py +1 -0
- mindspore/mint/distributed/distributed.py +208 -5
- mindspore/mint/nn/__init__.py +1 -1
- mindspore/mint/nn/functional.py +53 -6
- mindspore/mint/nn/layer/_functions.py +164 -294
- mindspore/mint/nn/layer/activation.py +8 -6
- mindspore/mint/nn/layer/conv.py +122 -98
- mindspore/mint/nn/layer/normalization.py +8 -22
- mindspore/mint/optim/adam.py +19 -18
- mindspore/mint/optim/adamw.py +14 -8
- mindspore/mint/optim/sgd.py +5 -5
- mindspore/nn/cell.py +325 -499
- mindspore/nn/grad/cell_grad.py +11 -12
- mindspore/nn/layer/activation.py +32 -34
- mindspore/nn/layer/basic.py +67 -64
- mindspore/nn/layer/channel_shuffle.py +4 -4
- mindspore/nn/layer/combined.py +4 -2
- mindspore/nn/layer/conv.py +86 -85
- mindspore/nn/layer/dense.py +9 -7
- mindspore/nn/layer/embedding.py +50 -52
- mindspore/nn/layer/image.py +37 -39
- mindspore/nn/layer/math.py +111 -112
- mindspore/nn/layer/normalization.py +56 -44
- mindspore/nn/layer/pooling.py +58 -63
- mindspore/nn/layer/rnn_cells.py +33 -33
- mindspore/nn/layer/rnns.py +56 -56
- mindspore/nn/layer/thor_layer.py +74 -73
- mindspore/nn/layer/transformer.py +11 -1
- mindspore/nn/learning_rate_schedule.py +20 -20
- mindspore/nn/loss/loss.py +79 -81
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/adasum.py +2 -2
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/thor.py +2 -2
- mindspore/nn/probability/distribution/exponential.py +2 -1
- mindspore/nn/probability/distribution/poisson.py +2 -1
- mindspore/nn/sparse/sparse.py +3 -3
- mindspore/nn/wrap/cell_wrapper.py +34 -37
- mindspore/nn/wrap/grad_reducer.py +37 -37
- mindspore/nn/wrap/loss_scale.py +72 -74
- mindspore/numpy/array_creations.py +5 -5
- mindspore/numpy/fft.py +1 -1
- mindspore/numpy/math_ops.py +1 -1
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
- mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
- mindspore/ops/_vmap/vmap_array_ops.py +6 -13
- mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +17 -8
- mindspore/ops/auto_generate/gen_extend_func.py +1 -51
- mindspore/ops/auto_generate/gen_ops_def.py +463 -257
- mindspore/ops/auto_generate/gen_ops_prim.py +1127 -885
- mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
- mindspore/ops/composite/__init__.py +10 -0
- mindspore/ops/composite/base.py +8 -4
- mindspore/ops/composite/multitype_ops/__init__.py +12 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +132 -108
- mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
- mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
- mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
- mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
- mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
- mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
- mindspore/ops/function/__init__.py +3 -1
- mindspore/ops/function/_add_attr_func.py +11 -6
- mindspore/ops/function/array_func.py +7 -94
- mindspore/ops/function/debug_func.py +4 -3
- mindspore/ops/function/grad/grad_func.py +1 -1
- mindspore/ops/function/math_func.py +21 -367
- mindspore/ops/function/nn_func.py +26 -41
- mindspore/ops/function/other_func.py +4 -1
- mindspore/ops/function/random_func.py +31 -4
- mindspore/ops/functional.py +0 -2
- mindspore/ops/functional_overload.py +463 -6
- mindspore/ops/op_info_register.py +21 -0
- mindspore/ops/operations/__init__.py +5 -2
- mindspore/ops/operations/_custom_ops_utils.py +675 -8
- mindspore/ops/operations/_inner_ops.py +3 -6
- mindspore/ops/operations/_sequence_ops.py +1 -1
- mindspore/ops/operations/comm_ops.py +185 -26
- mindspore/ops/operations/custom_ops.py +235 -172
- mindspore/ops/operations/debug_ops.py +55 -4
- mindspore/ops/operations/image_ops.py +13 -13
- mindspore/ops/operations/manually_defined/ops_def.py +15 -16
- mindspore/ops/operations/math_ops.py +3 -4
- mindspore/ops/operations/nn_ops.py +5 -6
- mindspore/ops/primitive.py +6 -10
- mindspore/ops/tensor_method.py +36 -4
- mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
- mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
- mindspore/ops_generate/api/functions_cc_generator.py +58 -10
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
- mindspore/ops_generate/common/base_generator.py +14 -0
- mindspore/ops_generate/common/gen_constants.py +7 -2
- mindspore/ops_generate/common/gen_utils.py +0 -19
- mindspore/ops_generate/common/op_proto.py +11 -4
- mindspore/ops_generate/common/template.py +88 -11
- mindspore/ops_generate/gen_ops.py +1 -1
- mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
- mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
- mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -0
- mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
- mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
- mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
- mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
- mindspore/parallel/_auto_parallel_context.py +4 -2
- mindspore/parallel/_cell_wrapper.py +106 -40
- mindspore/parallel/_parallel_serialization.py +1 -1
- mindspore/parallel/_ps_context.py +4 -6
- mindspore/parallel/_tensor.py +167 -12
- mindspore/parallel/_transformer/moe.py +1 -1
- mindspore/parallel/_transformer/transformer.py +13 -8
- mindspore/parallel/auto_parallel.py +12 -5
- mindspore/parallel/checkpoint_convert.py +3 -3
- mindspore/parallel/checkpoint_transform.py +3 -1
- mindspore/parallel/cluster/process_entity/_api.py +84 -48
- mindspore/parallel/cluster/process_entity/_utils.py +95 -7
- mindspore/parallel/cluster/run.py +43 -4
- mindspore/parallel/function/__init__.py +8 -1
- mindspore/parallel/function/reshard_func.py +1 -1
- mindspore/parallel/nn/__init__.py +15 -2
- mindspore/parallel/nn/parallel_cell_wrapper.py +9 -10
- mindspore/parallel/nn/parallel_grad_reducer.py +7 -6
- mindspore/parallel/shard.py +2 -2
- mindspore/parallel/transform_safetensors.py +462 -174
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +3 -0
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
- mindspore/profiler/analysis/task_manager.py +1 -1
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +42 -22
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
- mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
- mindspore/profiler/common/constant.py +16 -0
- mindspore/profiler/common/profiler_context.py +25 -27
- mindspore/profiler/common/profiler_info.py +0 -16
- mindspore/profiler/common/profiler_op_analyse.py +235 -0
- mindspore/profiler/common/profiler_output_path.py +23 -8
- mindspore/profiler/common/profiler_parameters.py +128 -35
- mindspore/profiler/dynamic_profile/__init__.py +0 -0
- mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
- mindspore/profiler/dynamic_profiler.py +305 -314
- mindspore/profiler/envprofiler.py +12 -7
- mindspore/profiler/experimental_config.py +96 -6
- mindspore/profiler/mstx.py +33 -12
- mindspore/profiler/platform/__init__.py +2 -3
- mindspore/profiler/platform/npu_profiler.py +29 -19
- mindspore/profiler/profiler.py +35 -19
- mindspore/profiler/profiler_action_controller.py +64 -76
- mindspore/profiler/schedule.py +10 -4
- mindspore/rewrite/common/config.py +1 -0
- mindspore/rewrite/common/namer.py +1 -0
- mindspore/rewrite/common/namespace.py +1 -0
- mindspore/rewrite/node/node.py +31 -11
- mindspore/rewrite/parsers/assign_parser.py +1 -1
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +7 -10
- mindspore/runtime/__init__.py +5 -5
- mindspore/runtime/event.py +10 -4
- mindspore/runtime/executor.py +60 -45
- mindspore/runtime/memory.py +21 -30
- mindspore/runtime/thread_bind_core.py +298 -164
- mindspore/safeguard/rewrite_obfuscation.py +12 -13
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +6 -2
- mindspore/train/amp.py +43 -20
- mindspore/train/callback/__init__.py +5 -5
- mindspore/train/callback/_checkpoint.py +3 -6
- mindspore/train/callback/_flops_collector.py +1 -1
- mindspore/train/callback/_landscape.py +0 -1
- mindspore/train/callback/_train_fault_tolerance.py +71 -13
- mindspore/train/data_sink.py +11 -2
- mindspore/train/dataset_helper.py +9 -0
- mindspore/train/model.py +51 -33
- mindspore/train/serialization.py +133 -111
- mindspore/train/summary/summary_record.py +13 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +3 -2
- mindspore/utils/dryrun.py +0 -6
- mindspore/utils/runtime_execution_order_check.py +162 -78
- mindspore/utils/sdc_detect.py +68 -0
- mindspore/utils/utils.py +6 -9
- mindspore/version.py +1 -1
- {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +5 -4
- {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +329 -367
- mindspore/_deprecated/jit.py +0 -198
- mindspore/experimental/es/__init__.py +0 -22
- mindspore/experimental/es/embedding_service.py +0 -891
- mindspore/experimental/es/embedding_service_layer.py +0 -581
- mindspore/profiler/parser/__init__.py +0 -14
- mindspore/profiler/parser/aicpu_data_parser.py +0 -272
- mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
- mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
- mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
- mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
- mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
- mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
- mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
- mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
- mindspore/profiler/parser/ascend_flops_generator.py +0 -116
- mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
- mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
- mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
- mindspore/profiler/parser/ascend_memory_generator.py +0 -185
- mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
- mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
- mindspore/profiler/parser/ascend_op_generator.py +0 -334
- mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
- mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
- mindspore/profiler/parser/base_timeline_generator.py +0 -483
- mindspore/profiler/parser/container.py +0 -229
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
- mindspore/profiler/parser/flops_parser.py +0 -531
- mindspore/profiler/parser/framework_enum.py +0 -111
- mindspore/profiler/parser/framework_parser.py +0 -464
- mindspore/profiler/parser/framework_struct.py +0 -61
- mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
- mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
- mindspore/profiler/parser/hccl_parser.py +0 -573
- mindspore/profiler/parser/hwts_log_parser.py +0 -122
- mindspore/profiler/parser/integrator.py +0 -526
- mindspore/profiler/parser/memory_usage_parser.py +0 -277
- mindspore/profiler/parser/minddata_analyzer.py +0 -800
- mindspore/profiler/parser/minddata_parser.py +0 -186
- mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
- mindspore/profiler/parser/op_intermediate_parser.py +0 -149
- mindspore/profiler/parser/optime_parser.py +0 -250
- mindspore/profiler/parser/profiler_info.py +0 -213
- mindspore/profiler/parser/step_trace_parser.py +0 -666
- {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
- {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.6.0.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
mindspore/safeguard/rewrite_obfuscation.py CHANGED

@@ -81,7 +81,7 @@ def _transform_target_modules(target_modules):
         obfuscate_layers = target_modules[2].split(':')
         if obfuscate_layers[1] != 'all':
             max_layers = int(obfuscate_layers[1])
-            layers =
+            layers = list(range(0, max_layers))
         path_new = path.replace("blocks", "blocks/${layer}")
         network_obf_template['insert_ops'][0]['input_y'] = "obf_metadata_${layer}"
         weight_obf_template['weight_obf_ops'][0]['input_y'] = "obf_metadata_${layer}"
@@ -95,8 +95,8 @@ def _transform_target_modules(target_modules):
         obf_config['obf_metadata_config'].append(obf_medatadata)

     for name in target_list:
-        target_weight =
-        target_bias =
+        target_weight = '/'.join([path_new, name, 'weight'])
+        target_bias = '/'.join([path_new, name, 'bias'])
         weight_obf = weight_obf_template.copy()
         weight_obf['target'] = target_weight
         bias_obf = weight_obf_template.copy()
@@ -185,7 +185,7 @@ def obfuscate_ckpt(network, ckpt_files, target_modules=None, obf_config=None, sa
     def _gen_obf_metadata(config):
         name = config.get('name')
         if name is None:
-            return
+            return
         save_metadata = config.get('save_metadata', False)
         metadata_op_name = config.get('metadata_op')
         layers = config.get('layers')
@@ -213,7 +213,6 @@ def obfuscate_ckpt(network, ckpt_files, target_modules=None, obf_config=None, sa
             saved_obf_tensor = metadata_op(saved_obf_tensor)
         if saved_obf_tensor is not None:
             saved_metadata[obf_name] = saved_obf_tensor.asnumpy()
-        return True

     if not isinstance(network, nn.Cell):
         raise TypeError("network must be nn.Cell, but got {}.".format(type(network)))
@@ -283,13 +282,13 @@ def _obfuscate_single_ckpt(ckpt_name, obf_metadata, obf_config, saved_path):
     def _obfuscate_param(param, obf_metadata, obf_ops, layer=0):
         param_dtype = F.dtype(param)
         obf_param = param
-        for
-            op_name =
+        for obf_op in obf_ops:
+            op_name = obf_op.get('name')
             if not isinstance(op_name, str):
                 raise TypeError('{} should be str type, but got {}'.format(op_name, type(op_name)))
             if op_name == 'mul':
                 input_x = obf_param
-                input_y_name = _get_op_input_name(
+                input_y_name = _get_op_input_name(obf_op, 'input_y', layer)
                 input_y = obf_metadata.get(input_y_name)
                 if input_x is None or input_y is None:
                     log.error("input_x or input_y is None")
@@ -297,22 +296,22 @@ def _obfuscate_single_ckpt(ckpt_name, obf_metadata, obf_config, saved_path):
                 input_y = F.cast(input_y, param_dtype)
                 obf_param = ops.mul(input_x, input_y)
             elif op_name == 'permuate':
-                input_x_name = _get_op_input_name(
+                input_x_name = _get_op_input_name(obf_op, 'input_x', layer)
                 p = obf_metadata.get(input_x_name, None)
                 if p is None or obf_param is None:
                     log.error("input_x or param is None")
                     return None
                 obf_param = obf_param[p]
             elif op_name == 'matmul':
-                input_x_name = _get_op_input_name(
-                input_y_name = _get_op_input_name(
+                input_x_name = _get_op_input_name(obf_op, 'input_x', layer)
+                input_y_name = _get_op_input_name(obf_op, 'input_y', layer)
                 input_x = _get_op_input(input_x_name, obf_param)
                 input_y = _get_op_input(input_y_name, obf_param)
                 if input_x is None or input_y is None:
                     log.error("the input_x or input_y of op: {} is None.".format(op_name))
                     return None
-                input_x = ops.transpose(input_x, (1, 0)) if
-                input_y = ops.transpose(input_y, (1, 0)) if
+                input_x = ops.transpose(input_x, (1, 0)) if obf_op.get('transpose_a', False) else input_x
+                input_y = ops.transpose(input_y, (1, 0)) if obf_op.get('transpose_b', False) else input_y
                 obf_param = ops.matmul(F.cast(input_x, param_dtype), F.cast(input_y, param_dtype))
             else:
                 log.error("unsupported op, op must be matmul or permuate or mul, but got {}."
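The reconstructed hunks above show a per-parameter obfuscation loop that dispatches on an op name of mul, permuate or matmul. A minimal standalone sketch of that dispatch pattern is below; the helper name apply_obf_ops and the plain-dict op format are illustrative, not part of the MindSpore API.

# Sketch only: mirrors the mul / permuate / matmul dispatch in the diff above.
# apply_obf_ops and the dict-based op description are hypothetical helpers.
import mindspore.ops as ops

def apply_obf_ops(param, obf_metadata, obf_ops):
    """Apply a list of ops like {'name': 'mul', 'input_y': 'obf_metadata_0'} to a tensor."""
    obf_param = param
    for obf_op in obf_ops:
        name = obf_op.get('name')
        if name == 'mul':
            # element-wise mask with a tensor looked up from the metadata dict
            obf_param = ops.mul(obf_param, obf_metadata[obf_op['input_y']])
        elif name == 'permuate':
            # row permutation by an integer index tensor
            obf_param = obf_param[obf_metadata[obf_op['input_x']]]
        elif name == 'matmul':
            obf_param = ops.matmul(obf_param, obf_metadata[obf_op['input_y']])
        else:
            raise ValueError("unsupported op: {}".format(name))
    return obf_param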
mindspore/swresample-4.dll CHANGED (binary file)
mindspore/swscale-6.dll CHANGED (binary file)
mindspore/tinyxml2.dll CHANGED (binary file)
mindspore/train/_utils.py CHANGED

@@ -582,7 +582,8 @@ def _progress_bar(iterable, total=None):
         print_progress_bar(i)


-def _load_and_transform(path, name_map, load_func, transform_func):
+def _load_and_transform(path, name_map, load_func, transform_func=None):
+    """use load_func to load and use transform_func to convert"""
     if load_func is not None:
         param_dict = load_func(path)
     else:
@@ -590,5 +591,8 @@ def _load_and_transform(path, name_map, load_func, transform_func):
     transform_dict = {}
     for k, v in param_dict.items():
         new_name = name_map.get(k, k) if name_map is not None else k
-
+        if transform_func is not None:
+            transform_dict[new_name] = transform_func(v, new_name)
+        else:
+            transform_dict[new_name] = v
     return transform_dict
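The `_load_and_transform` change makes `transform_func` optional; when it is omitted, parameters are only renamed via `name_map`. A hedged sketch of both call styles (the helper is private, so treat this as illustrative only):

# Illustrative only: _load_and_transform is a private helper in mindspore.train._utils.
import mindspore as ms
from mindspore.train._utils import _load_and_transform

name_map = {"backbone.old_fc.weight": "backbone.new_fc.weight"}  # hypothetical names

# Rename only; with the 2.7.0rc1 change transform_func may be left out (defaults to None).
params = _load_and_transform("net.ckpt", name_map, ms.load_checkpoint, None)

# Rename and convert each value, matching the transform_func(v, new_name) call in the diff.
to_fp16 = lambda value, name: value.astype(ms.float16)
params_fp16 = _load_and_transform("net.ckpt", name_map, ms.load_checkpoint, to_fp16)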
mindspore/train/amp.py CHANGED

@@ -69,6 +69,9 @@ AMP_BLACK_LIST = [
 AMP_AUTO_WHITE_LIST = [
     P.Conv2D,
     P.Conv3D,
+    gen.Conv2DExt,
+    gen.Conv3DExt,
+    gen.ConvTranspose2D,
     P.Conv2DTranspose,
     P.Conv3DTranspose,
     gen.Convolution,
@@ -80,6 +83,10 @@ AMP_AUTO_WHITE_LIST = [
     P.Einsum,
     gen.Dense,
     gen.Addmm,
+    gen.Addbmm,
+    gen.Addmv,
+    gen.Baddbmm,
+    gen.Mv,
 ]

 AMP_AUTO_BLACK_LIST = [
@@ -90,8 +97,10 @@ AMP_AUTO_BLACK_LIST = [
     P.Erfinv,
     P.Exp,
     P.Expm1,
-
-
+    gen.Log,
+    gen.Log10,
+    gen.Log1p,
+    gen.Log2,
     P.Reciprocal,
     P.Rsqrt,
     P.Sinh,
@@ -103,6 +112,7 @@ AMP_AUTO_BLACK_LIST = [
     P.BatchNorm,
     gen.BatchNormExt,
     gen.GroupNorm,
+    gen.Norm,
     P.KLDivLoss,
     P.SmoothL1Loss,
     P.MultilabelMarginLoss,
@@ -113,7 +123,19 @@ AMP_AUTO_BLACK_LIST = [
     P.Pdist,
     P.Cdist,
     P.Renorm,
+    gen.ReduceProd,
+    gen.Softmax,
+    gen.LogSoftmax,
+    gen.LogSoftmaxExt,
+    gen.CumProd,
+    gen.CumSum,
+    gen.CumsumExt,
+    gen.ProdExt,
+    gen.SumExt,
+    gen.L1LossExt,
     gen.MSELossExt,
+    gen.NLLLoss,
+    gen.NLLLoss2d,
 ]

 # Indicates which inputs of primitives need to be converted
@@ -358,7 +380,7 @@ def _auto_black_list(network, black_list, dtype):
     return network


-class
+class AmpDecorator:
     """
     Auto mixed precision decorator.
     Type of lists: List[Tuple[str, List[int]]]
@@ -384,7 +406,7 @@ def _set_amp_decorator(obj, amp_level, amp_dtype, white_list, black_list):
     if inspect.isfunction(obj) or inspect.ismethod(obj):
         @functools.wraps(obj)
         def wrapper(*args, **kwargs):
-            with
+            with AmpDecorator(amp_level, amp_dtype, white_list, black_list):
                 return obj(*args, **kwargs)
         return wrapper
     if isinstance(obj, nn.Cell):
@@ -423,17 +445,18 @@ def auto_mixed_precision(network, amp_level="O0", dtype=mstype.float16):

     Operators in `auto_whitelist` are:

-        ``Conv2D``, ``
-        ``
+        ``Conv2D``, ``Conv2DExt``, ``Conv3D``, ``Conv3DExt``, ``Conv2DTranspose``, ``ConvTranspose2D``,
+        ``Conv3DTranspose``, ``Convolution``, ``MatMul``, ``MatMulExt``, ``BatchMatMul``, ``BatchMatMulExt``, ``PReLU``,
+        ``Einsum``, ``Dense``, ``Addmm``, ``Addbmm``, ``Addmv``, ``Baddbmm``, ``Mv``

     Operators in `auto_blacklist` are:

-        ``Pow``, ``ACos``, ``Asin``, ``Cosh``, ``Erfinv``, ``Exp``, ``Expm1``, ``Log``, ``Log1p``, ``
-        ``Rsqrt``, ``Sinh``, ``Tan``, ``Softplus``, ``SoftplusExt``, ``LayerNorm``, ``LayerNormExt``,
-        ``BatchNormExt``, ``GroupNorm``, ``KLDivLoss``, ``SmoothL1Loss``, ``MultilabelMarginLoss``,
-        ``TripletMarginLoss``, ``MultiMarginLoss``, ``BCEWithLogitsLoss``, ``Pdist``, ``Cdist``,
-        ``ReduceProd``, ``Softmax``, ``LogSoftmax``, ``
-        ``Norm``, ``MSELossExt``
+        ``Pow``, ``ACos``, ``Asin``, ``Cosh``, ``Erfinv``, ``Exp``, ``Expm1``, ``Log``, ``Log10``, ``Log1p``, ``Log2``,
+        ``Reciprocal``, ``Rsqrt``, ``Sinh``, ``Tan``, ``Softplus``, ``SoftplusExt``, ``LayerNorm``, ``LayerNormExt``,
+        ``BatchNorm``, ``BatchNormExt``, ``GroupNorm``, ``KLDivLoss``, ``SmoothL1Loss``, ``MultilabelMarginLoss``,
+        ``SoftMarginLoss``, ``TripletMarginLoss``, ``MultiMarginLoss``, ``BCEWithLogitsLoss``, ``Pdist``, ``Cdist``,
+        ``Renorm``, ``ReduceProd``, ``Softmax``, ``LogSoftmax``, ``LogSoftmaxExt``, ``CumProd``, ``CumSum``,
+        ``CumsumExt``, ``ProdExt``, ``SumExt``, ``Norm``, ``L1LossExt``, ``MSELossExt``, ``NLLLoss``, ``NLLLoss2d``

     Operators in `promote_list` are:

@@ -638,7 +661,7 @@ def _add_loss_network(network, loss_fn, cast_model_type):


 def _is_grad_accumulation(mcell):
-    if mcell.cls_name
+    if mcell.cls_name in {"GradAccumulationCell", "GradAccumulation"}:
         return True
     for cell in mcell.cells():
         if _is_grad_accumulation(cell):
@@ -675,23 +698,23 @@ def build_train_network(network, optimizer, loss_fn=None, level='O0', boost_leve
     Build the mixed precision training cell automatically.

     Note:
-
-
-
+        After using `custom_mixed_precision` or `auto_mixed_precision` for precision conversion, it is not supported
+        to perform the precision conversion again. If `build_train_network` is used to train a converted network,
+        `level` need to be configured to ``O0`` to avoid the duplicated accuracy conversion.

     Args:
         network (Cell): Definition of the network.
         optimizer (:class:`mindspore.nn.Optimizer`): Define the optimizer to update the Parameter.
-        loss_fn (Union[None, Cell]): Define the loss function. If None,
-            Default: ``None`` .
-        level (str): Supports ['O0', 'O1', 'O2', 'O3', 'auto']. Default: ``'O0'`` .
+        loss_fn (Union[None, Cell], optional): Define the loss function. If None,
+            the `network` should have the loss inside. Default: ``None`` .
+        level (str, optional): Supports ['O0', 'O1', 'O2', 'O3', 'auto']. Default: ``'O0'`` .

         For details on amp level, refer to :func:`mindspore.amp.auto_mixed_precision`.

         Property of `keep_batchnorm_fp32`, `cast_model_type` and `loss_scale_manager` determined by `level`
         setting may be overwritten by settings in `kwargs`.

-        boost_level (str): Option for argument `level` in `mindspore.boost` , level for boost mode
+        boost_level (str, optional): Option for argument `level` in `mindspore.boost` , level for boost mode
             training. Supports ['O0', 'O1', 'O2']. Default: ``'O0'`` .

             - 'O0': Do not change.
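The amp.py hunks above extend the operator lists that drive the ``auto`` amp level described in the docstring. A minimal usage sketch of mindspore.amp.auto_mixed_precision follows; the tiny network is illustrative.

# Sketch: with amp_level="auto", ops on the auto_whitelist run in float16 while ops on
# the auto_blacklist stay in float32, per the lists extended in this diff.
import mindspore as ms
from mindspore import nn, amp

net = nn.SequentialCell([nn.Dense(16, 16), nn.ReLU(), nn.Dense(16, 4)])
net = amp.auto_mixed_precision(net, amp_level="auto", dtype=ms.float16)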
mindspore/train/callback/__init__.py CHANGED

@@ -15,6 +15,11 @@
 """Callback related classes and functions."""
 from __future__ import absolute_import

+__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
+           "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
+           "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
+           "TrainFaultTolerance"]
+
 from mindspore.train.callback._callback import Callback
 from mindspore.train.callback._callback import CallbackManager as _CallbackManager
 from mindspore.train.callback._callback import InternalCallbackParam as _InternalCallbackParam
@@ -37,8 +42,3 @@ from mindspore.train.callback._on_request_exit import OnRequestExit
 from mindspore.train.callback._backup_and_restore import BackupAndRestore
 from mindspore.train.callback._flops_collector import FlopsUtilizationCollector
 from mindspore.train.callback._train_fault_tolerance import TrainFaultTolerance
-
-__all__ = ["Callback", "LossMonitor", "TimeMonitor", "ModelCheckpoint", "FlopsUtilizationCollector",
-           "SummaryCollector", "CheckpointConfig", "RunContext", "LearningRateScheduler", "SummaryLandscape",
-           "History", "LambdaCallback", "ReduceLROnPlateau", "EarlyStopping", "OnRequestExit", "BackupAndRestore",
-           "TrainFaultTolerance"]
mindspore/train/callback/_checkpoint.py CHANGED

@@ -411,8 +411,6 @@ class CheckpointConfig:
             handle_append_info["epoch_num"] = 0
         if "step_num" in append_info:
             handle_append_info["step_num"] = 0
-        if "random_op" in append_info:
-            handle_append_info["random_op"] = 0
         dict_num = 0
         for element in append_info:
             if not isinstance(element, str) and not isinstance(element, dict):
@@ -588,8 +586,6 @@ class ModelCheckpoint(Callback):
         # save graph (only once)
         if not self._graph_saved:
             graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
-            if os.path.isfile(graph_file_name) and context.get_context("mode") == context.GRAPH_MODE:
-                os.remove(graph_file_name)
             _save_graph(cb_params.train_network, graph_file_name)
             self._graph_saved = True
         self._save_ckpt(cb_params)
@@ -713,12 +709,13 @@ class ModelCheckpoint(Callback):
                 save_checkpoint(network, cur_file, False, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
                                 crc_check=self._config.crc_check, format=self._config.format,
-                                incremental=self._map_param_inc, choice_func=choice_func
+                                incremental=self._map_param_inc, choice_func=choice_func,
+                                remove_redundancy=self._config.remove_redundancy)
             else:
                 save_checkpoint(network, cur_file, self._config.integrated_save, self._config.async_save,
                                 self._append_dict, self._config.enc_key, self._config.enc_mode,
                                 crc_check=self._config.crc_check, format=self._config.format,
-                                incremental=self._map_param_inc)
+                                incremental=self._map_param_inc, remove_redundancy=self._config.remove_redundancy)

             self._latest_ckpt_file_name = cur_file

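The _checkpoint.py hunk above threads the config's remove_redundancy flag through to save_checkpoint. A hedged usage sketch follows; the remove_redundancy option on CheckpointConfig is assumed from this diff rather than verified against the release notes.

# Sketch: ask ModelCheckpoint to drop redundant (parallel-duplicated) parameters on save,
# matching the remove_redundancy pass-through added in this diff.
from mindspore.train import CheckpointConfig, ModelCheckpoint

config = CheckpointConfig(save_checkpoint_steps=1000,
                          keep_checkpoint_max=5,
                          remove_redundancy=True)  # assumed option, see the diff above
ckpt_cb = ModelCheckpoint(prefix="net", directory="./checkpoints", config=config)
# model.train(epoch, train_dataset, callbacks=[ckpt_cb])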
mindspore/train/callback/_flops_collector.py CHANGED

@@ -53,7 +53,7 @@ class FlopsUtilizationCollector(Callback):
     The FlopsUtilizationCollector interface counts the model utilization information MFU
     and the hardware utilization information HFU.
     Currently, the API counts only the forward and backward flops of MatMul,
-    BatchMatMul,
+    BatchMatMul, flash_attention_score, and Conv2D operators.
     Only used in graph mode with static shape.

     Args:

mindspore/train/callback/_landscape.py CHANGED

@@ -404,7 +404,6 @@ class SummaryLandscape:
         def _set_context(device_id):
             """Set context."""
             context.set_context(device_id=device_id)
-            context.set_context(mode=context.GRAPH_MODE)

     def _create_landscape_by_pca(self, epochs, proz, landscape_size, device_ids=None, callback_fn=None, executor=None):
         """Create landscape by PCA."""
mindspore/train/callback/_train_fault_tolerance.py CHANGED

@@ -25,8 +25,9 @@ from mindspore.communication import get_rank, get_group_size
 from mindspore import log as logger
 from mindspore.train.serialization import _get_cur_rank_dp
 from mindspore._c_expression import _repair_device, _stop_device, _tft_sem_post, _tft_sem_enable
-from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm
+from mindspore._c_expression import _rebuild_world_group, _rebuild_sub_group, _finalize_comm, _clean_rootinfo
 from mindspore._c_expression import clean_tdt_channel
+from mindspore._c_expression import _pre_launch_send_recv
 from mindspore._c_expression import send_recv, reset_params
 from mindspore._c_expression import CollectiveManager
 from mindspore._c_expression import _get_uce_process_strategy, _get_uce_mem_info
@@ -35,6 +36,7 @@ from mindspore.ops.operations.manually_defined._inner import TensorReport
 import mindspore
 import mindspore.common.dtype as mstype
 from mindspore.parallel._recovery_context import _set_recovery_context
+from mindspore import runtime


 def _get_ckpt_dir(step, ckpt_save_path, is_tmp_file):
@@ -80,7 +82,7 @@ def _save_checkpoint_on_failure(step, save_info, args, cb_ctx):
     append_dict["loss_scale"] = outputs[2]

     ckpt_file = f"ttp_rank_{str(cur_rank)}-{str(cur_epoch_num)}_{str(step_num_in_epoch)}.ckpt"
-    cur_ckpt_dir = _get_ckpt_dir(step, ckpt_save_path, True)
+    cur_ckpt_dir = os.path.join(_get_ckpt_dir(step, ckpt_save_path, True), "rank_" + str(cur_rank))
     os.makedirs(cur_ckpt_dir, exist_ok=True)
     cur_file = os.path.join(cur_ckpt_dir, ckpt_file)
     save_checkpoint(cb_params.train_network, cur_file,
@@ -110,7 +112,7 @@ def _tft_exit_cb(ctx):

 def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_ctx):
     """ Callback used for TFT repair function."""
-    logger.warning("Enter _tft_repair_callback repair type: {
+    logger.warning(f"Enter _tft_repair_callback repair type: {repair_info['repair_type']}")
     if (repair_info["repair_type"] in (cb_ctx.tft.RepairType.RT_UCE_HIGHLEVEL.value,
                                        cb_ctx.tft.RepairType.RT_UCE_LOWLEVEL.value)):
         logger.warning("Enter _tft_repair_callback uce REPARI_DEVICE device_id : {}".format(cb_ctx.device_id))
@@ -138,7 +140,7 @@ def _tft_repair_callback(step, need_rebuild, error_ranks, repair_info, args, cb_

 def _tft_clean_callback(is_uce_error, args, ctx):
     """ Callback used for TFT clean function."""
-    logger.warning("Enter _tft_clean_callback")
+    logger.warning(f"Enter _tft_clean_callback, device id:{ctx.device_id}")
     ret = 0
     if is_uce_error:
         _get_uce_mem_info(ctx.device_id)
@@ -154,29 +156,36 @@ def _tft_clean_callback(is_uce_error, args, ctx):
         logger.warning("Enter _tft_clean_callback resume_hccl_comm")
         CollectiveManager.get_instance().resume_hccl_comm()
     logger.warning("Finish _tft_clean_callback, ret: {}".format(ret))
+    if ctx.tft.tft_get_repair_type() == "recover":
+        logger.warning(f"Destroy hcom")
+        _finalize_comm()
+        logger.warning(f"Destroy hcom end")
     return ret


 def _tft_stop_callback(args, cb_ctx):
     """ Callback used for TFT stop function."""
-    logger.warning("Enter _tft_stop_callback device_id: {
+    logger.warning(f"Enter _tft_stop_callback device_id: {cb_ctx.device_id}")
     _stop_device(cb_ctx.device_id)
+    cb_ctx.stop_been_called = True
     if (not cb_ctx.is_uce_rank) and (not cb_ctx._is_params_consistent()):  # pylint: disable=W0212
         raise RuntimeError("Can't stop device, because training parameters are left in inconsistent state!")
     cb_ctx.is_uce_rank = False
     if cb_ctx.tft.tft_get_repair_type() == "recover":
         logger.warning(f"Reset limit step")
         cb_ctx.tft.tft_reset_limit_step()
-    logger.
+    logger.warning("Finish _tft_stop_callback")


 def _tft_rebuild_sub_groups(fault_ranks, args, ctx):
     """Callback used for TFT Rebuild Group function."""
-    logger.warning(f"Enter _tft_rebuild_sub_groups, device id:
-    _finalize_comm()
+    logger.warning(f"Enter _tft_rebuild_sub_groups, device id: {ctx.device_id}")
     _rebuild_world_group()
     _rebuild_sub_group()
     _set_recovery_context(is_arf=True)
+    logger.warning(f"try to pre launch send recv before real launch")
+    _pre_launch_send_recv(context.get_context('device_id'))
+    logger.warning(f"Pre launch send recv before real launch end")
     logger.warning("Enter _tft_rebuild_sub_groups ok ")


@@ -299,6 +308,12 @@ class TrainFaultTolerance(Callback):

     def __init__(self, ckpt_save_path=None, **kwargs):
         super(TrainFaultTolerance, self).__init__()
+        logger.info(f"MS_ENABLE_TFT: {os.getenv('MS_ENABLE_TFT', '')}")
+        if self._only_enable_tsp():
+            self.tft = _tft_handler.get_tft()
+            self._check_init()
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.save_cb = kwargs.get("ckpt_save_fn", None)
         self.ckpt_save_path = ckpt_save_path
         if self.save_cb is None and self.ckpt_save_path is None:
@@ -308,19 +323,24 @@ class TrainFaultTolerance(Callback):
         self.device_id = context.get_context("device_id")
         self.cur_step_num = 0
         self.cur_epoch_num = 0
+        self.clean_unique_id = False
         # For TREError(Training Result Error) scene, parameter `ckpt_load_fn` must be provided to load checkpoint
         # from file for resuming training, the `ckpt_load_fn` is a function, prototype of which is:
         # `def load_checkpoint() -> tuple(dict, bool)`, the return value is a tuple containing 2 values,
         # i.e. (param_dict, remove_redundancy)
         self.ckpt_load_func = kwargs.get("ckpt_load_fn", None)
-        self.tft = _tft_handler.get_tft()
         if self._only_enable_tre():
             return
+        self.tft = _tft_handler.get_tft()
         self._check_init()
+        if self._only_enable_tre_and_tsp():
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)
+            return
         self.global_step = None
         self.learning_rate = None
         self.has_init_replica = False
         self.is_uce_rank = False
+        self.stop_been_called = False

         self.assign = mindspore.ops.Assign()
         self.g_one = Parameter(Tensor([1], dtype=mstype.int32))
@@ -336,6 +356,22 @@ class TrainFaultTolerance(Callback):
             return False
         return "TRE:1" in env_enable

+    def _only_enable_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        non_tsp_flags = ["TTP:1", "UCE:1", "ARF:1", "TRE:1"]
+        if any(flag in env_enable for flag in non_tsp_flags):
+            return False
+        return "TSP:1" in env_enable
+
+    def _only_enable_tre_and_tsp(self):
+        """Check if only configured MS_ENABLE_TFT='{TRE:1, TSP:1}'"""
+        env_enable = os.getenv("MS_ENABLE_TFT", "")
+        other_flags = ["TTP:1", "UCE:1", "ARF:1"]
+        if any(flag in env_enable for flag in other_flags):
+            return False
+        return "TRE:1" in env_enable and "TSP:1" in env_enable
+
     def _check_init(self):
         """Check if the mindio-ttp had inited"""
         if self.tft is None:
@@ -427,6 +463,8 @@ class TrainFaultTolerance(Callback):
         self.tft.tft_register_clean_handler(_tft_clean_callback, self)
         self.tft.tft_register_repair_handler(_tft_repair_callback, self)
         self.tft.tft_register_rebuild_group_handler(_tft_rebuild_sub_groups, self)
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            self.tft.tft_register_stream_sync_handler(runtime.synchronize, self)

     def _reset_acc_grads(self):
         accu_grad_params = map(lambda e: e[1],
@@ -436,6 +474,12 @@ class TrainFaultTolerance(Callback):
         if reset_params(accu_grad_list) != 0:
             raise ValueError("Call reset_params failed.")

+    def _clear_unique_id(self):
+        """Clean unique id on first train step end"""
+        if not self.clean_unique_id and ("ARF:1" in os.getenv("MS_ENABLE_TFT", "")):
+            _clean_rootinfo()
+            self.clean_unique_id = True
+
     def on_train_step_end(self, run_context):
         """
         Report status to MindIO TFT after every step finished.
@@ -446,13 +490,19 @@ class TrainFaultTolerance(Callback):
         """
         if self._only_enable_tre():
             return
-
-        self.has_init_replica = True
-        self._set_tft_optimizer_replica(run_context)
+
         cb_params = run_context.original_args()
         logger.info("START Set optimizer finish step status to TFT. step: {}".format(cb_params.cur_step_num))
         self.cur_step_num = cb_params.cur_step_num
         self.cur_epoch_num = cb_params.cur_epoch_num
+        if self._only_enable_tsp() or self._only_enable_tre_and_tsp():
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)
+            return
+
+        if self.has_init_replica is False:
+            self.has_init_replica = True
+            self._set_tft_optimizer_replica(run_context)
         if cb_params.optimizer is not None:
             self.global_step = cb_params.optimizer.global_step.clone()
             self.assign(cb_params.optimizer.tft_g_one_flag, self.g_one)
@@ -462,7 +512,13 @@ class TrainFaultTolerance(Callback):
         else:
             raise ValueError("TFT feature need optimizer or network's optimizer!")
         self.tft.tft_end_updating_os(cb_params.cur_step_num + self.initial_step)
+        if cb_params.is_arf:
+            self.clean_unique_id = False
+        self._clear_unique_id()
         logger.info("END Set optimizer finish step status to TFT.")
+        if "TSP:1" in os.getenv("MS_ENABLE_TFT", ""):
+            logger.info("Go into tft_pause_train.")
+            self.tft.tft_pause_train(self.cur_step_num)

     def on_train_begin(self, run_context):
         """
@@ -472,6 +528,8 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
+        if self._only_enable_tsp():
+            return
         cb_params = run_context.original_args()
         if self._only_enable_tre():
             self.cb_params = cb_params
@@ -491,6 +549,6 @@ class TrainFaultTolerance(Callback):
             run_context (RunContext): Context of the train running. Refer to
                 :class:`mindspore.train.RunContext` for detail.
         """
-        if self._only_enable_tre():
+        if self._only_enable_tre() or self._only_enable_tsp() or self._only_enable_tre_and_tsp():
             return
         _tft_handler.unregister_tft()
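The _train_fault_tolerance.py hunks above add a TSP-only mode that registers a stream-sync handler and pauses training via tft_pause_train. A hedged sketch of enabling it follows; the flag string uses the TTP/UCE/ARF/TRE/TSP names from the diff, and MindIO TFT must be available and initialized in the environment.

# Sketch: enable the TSP-only fault-tolerance path shown in the diff.
# MS_ENABLE_TFT must be set before distributed training initializes.
import os
os.environ["MS_ENABLE_TFT"] = "{TSP:1}"

from mindspore.train.callback import TrainFaultTolerance

# In TSP-only mode __init__ returns early, so no ckpt_save_path is required.
tft_cb = TrainFaultTolerance()
# model.train(epoch, train_dataset, callbacks=[tft_cb])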
mindspore/train/data_sink.py CHANGED

@@ -18,7 +18,7 @@ import mindspore.ops as ops
 from mindspore import context
 from mindspore.common.dtype import pytype_to_dtype
 from mindspore.common.api import jit
-from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes
+from mindspore.train._utils import _exec_datagraph, _get_types_and_shapes, enable_data_broadcast
 from mindspore.train.dataset_helper import _has_dynamic_shape, _check_inputs
 import mindspore.dataset as ds
 from mindspore._c_expression import _set_dataset_mode_config
@@ -41,6 +41,15 @@ def _init_sink_dataset(dataset, sink_size, input_signature, create_info):
     is_info_queue = (create_info and sink_size == 1 and dataset_size != 1 and
                      input_signature is None and not dynamic_shape and
                      context.get_context('device_target') == 'Ascend')
+
+    # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+    # otherwise get_data_info will stuck since some rank do not consume data.
+    use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+    data_broadcast = enable_data_broadcast()
+
+    if use_pipeline_parallel or data_broadcast:
+        is_info_queue = False
+
     transfer_dataset = _exec_datagraph(dataset, sink_size, create_data_info_queue=is_info_queue)
     dataset.__transfer_dataset__ = transfer_dataset

@@ -214,7 +223,7 @@ def data_sink(fn, dataset, sink_size=1, jit_config=None, input_signature=None):
     loop = sink_size
     create_info = True
     if jit_config is None:
-        create_info =
+        create_info = loop == 1
         loop = 1
     ori_next_op, is_info_queue = _init_sink_dataset(dataset, loop, input_signature, create_info)

mindspore/train/dataset_helper.py CHANGED

@@ -564,6 +564,15 @@ class _DatasetIter:
         self.sink_size = dataset.__loop_size__
         create_data_info_queue = (
             sink_size == 1 and self.sink_count == 1 and dataset.get_dataset_size() != 1)
+
+        # Don't enable dynamic shape(multi-subgraph) feature in pp/data_broadcast mode,
+        # otherwise get_data_info will stuck since some rank do not consume data.
+        use_pipeline_parallel = (context.get_auto_parallel_context("pipeline_stages") > 1)
+        data_broadcast = enable_data_broadcast()
+
+        if use_pipeline_parallel or data_broadcast:
+            create_data_info_queue = False
+
         dataset.__transfer_dataset__ = _exec_datagraph(dataset, self.sink_size,
                                                        create_data_info_queue=create_data_info_queue)
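Both data_sink.py and dataset_helper.py now switch the data-info queue off when pipeline parallelism (pipeline_stages > 1) or data broadcast is active. For context, a minimal mindspore.data_sink sketch; the dataset and step function are illustrative, and data sinking requires a device backend such as Ascend or GPU.

# Sketch: wrap a per-step function with data sinking. Under pipeline parallelism or
# data broadcast the info queue is now disabled automatically by the change above.
import numpy as np
import mindspore as ms
import mindspore.dataset as ds

dataset = ds.NumpySlicesDataset({"x": np.random.randn(32, 4).astype(np.float32)},
                                shuffle=False)

def step(x):
    return x.sum()

sink_step = ms.data_sink(step, dataset, sink_size=1)
out = sink_step()  # runs one sinked step on the device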