mindspore 2.6.0rc1__cp39-cp39-win_amd64.whl → 2.7.0rc1__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore may be problematic; see the registry listing for details.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +1 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +40 -9
- mindspore/{_deprecated → _extends/optimize}/__init__.py +9 -3
- mindspore/_extends/optimize/cell_utils.py +96 -0
- mindspore/_extends/parse/__init__.py +2 -2
- mindspore/_extends/parse/compile_config.py +44 -22
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -1
- mindspore/_extends/parse/parser.py +37 -62
- mindspore/_extends/parse/resources.py +39 -0
- mindspore/_extends/parse/standard_method.py +43 -13
- mindspore/_extends/parse/trope.py +8 -1
- mindspore/_extends/pijit/__init__.py +1 -2
- mindspore/amp.py +4 -4
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/boost/adasum.py +1 -1
- mindspore/boost/boost_cell_wrapper.py +4 -4
- mindspore/common/__init__.py +27 -2
- mindspore/common/_grad_function.py +2 -1
- mindspore/common/_pijit_context.py +28 -7
- mindspore/common/_stub_tensor.py +1 -209
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +77 -16
- mindspore/common/api.py +238 -113
- mindspore/common/dtype.py +21 -11
- mindspore/common/dump.py +10 -15
- mindspore/common/generator.py +5 -3
- mindspore/common/hook_handle.py +11 -2
- mindspore/common/jit_config.py +1 -1
- mindspore/common/jit_trace.py +84 -105
- mindspore/common/parameter.py +26 -12
- mindspore/common/recompute.py +3 -3
- mindspore/common/sparse_tensor.py +0 -3
- mindspore/common/symbol.py +0 -1
- mindspore/common/tensor.py +81 -81
- mindspore/communication/_comm_helper.py +46 -4
- mindspore/communication/management.py +79 -7
- mindspore/context.py +58 -40
- mindspore/dataset/core/config.py +3 -3
- mindspore/dataset/engine/datasets.py +20 -7
- mindspore/dataset/engine/datasets_user_defined.py +33 -3
- mindspore/dataset/engine/iterators.py +2 -2
- mindspore/dataset/engine/obs/config_loader.py +2 -2
- mindspore/dataset/engine/obs/obs_mindrecord_dataset.py +8 -0
- mindspore/dataset/transforms/py_transforms.py +7 -3
- mindspore/dataset/transforms/transforms.py +7 -3
- mindspore/dataset/vision/validators.py +1 -0
- mindspore/device_context/ascend/device.py +1 -1
- mindspore/device_context/gpu/__init__.py +2 -2
- mindspore/device_context/gpu/device.py +1 -1
- mindspore/device_context/gpu/op_precision.py +4 -2
- mindspore/device_context/gpu/op_tuning.py +6 -3
- mindspore/device_manager.py +16 -9
- mindspore/dnnl.dll +0 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +3 -7
- mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
- mindspore/experimental/optim/adadelta.py +13 -20
- mindspore/experimental/optim/adagrad.py +15 -22
- mindspore/experimental/optim/adam.py +17 -24
- mindspore/experimental/optim/adamax.py +14 -22
- mindspore/experimental/optim/adamw.py +28 -34
- mindspore/experimental/optim/asgd.py +15 -25
- mindspore/experimental/optim/lr_scheduler.py +27 -45
- mindspore/experimental/optim/nadam.py +14 -24
- mindspore/experimental/optim/optimizer.py +13 -23
- mindspore/experimental/optim/radam.py +18 -24
- mindspore/experimental/optim/rmsprop.py +14 -25
- mindspore/experimental/optim/rprop.py +15 -26
- mindspore/experimental/optim/sgd.py +9 -19
- mindspore/hal/__init__.py +4 -4
- mindspore/hal/contiguous_tensors_handle.py +2 -2
- mindspore/hal/memory.py +27 -7
- mindspore/include/api/cell.h +37 -1
- mindspore/include/api/delegate.h +10 -0
- mindspore/include/api/model.h +3 -0
- mindspore/include/api/types.h +2 -2
- mindspore/include/c_api/model_c.h +0 -58
- mindspore/include/c_api/tensor_c.h +0 -26
- mindspore/include/dataset/vision_ascend.h +1 -1
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar10.py +60 -11
- mindspore/mindrecord/tools/cifar10_to_mr.py +5 -0
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mindspore_ops_host.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/__init__.py +6 -46
- mindspore/mint/distributed/__init__.py +1 -0
- mindspore/mint/distributed/distributed.py +212 -9
- mindspore/mint/nn/__init__.py +1 -1
- mindspore/mint/nn/functional.py +53 -6
- mindspore/mint/nn/layer/_functions.py +164 -294
- mindspore/mint/nn/layer/activation.py +8 -6
- mindspore/mint/nn/layer/conv.py +137 -101
- mindspore/mint/nn/layer/normalization.py +8 -22
- mindspore/mint/optim/adam.py +19 -18
- mindspore/mint/optim/adamw.py +14 -8
- mindspore/mint/optim/sgd.py +5 -5
- mindspore/nn/cell.py +328 -502
- mindspore/nn/grad/cell_grad.py +11 -12
- mindspore/nn/layer/activation.py +32 -34
- mindspore/nn/layer/basic.py +67 -64
- mindspore/nn/layer/channel_shuffle.py +4 -4
- mindspore/nn/layer/combined.py +4 -2
- mindspore/nn/layer/conv.py +117 -110
- mindspore/nn/layer/dense.py +9 -7
- mindspore/nn/layer/embedding.py +50 -52
- mindspore/nn/layer/image.py +37 -39
- mindspore/nn/layer/math.py +111 -112
- mindspore/nn/layer/normalization.py +56 -44
- mindspore/nn/layer/pooling.py +58 -63
- mindspore/nn/layer/rnn_cells.py +33 -33
- mindspore/nn/layer/rnns.py +56 -56
- mindspore/nn/layer/thor_layer.py +74 -73
- mindspore/nn/layer/transformer.py +11 -1
- mindspore/nn/learning_rate_schedule.py +20 -20
- mindspore/nn/loss/loss.py +79 -81
- mindspore/nn/optim/adam.py +3 -3
- mindspore/nn/optim/adasum.py +2 -2
- mindspore/nn/optim/asgd.py +2 -0
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/thor.py +2 -2
- mindspore/nn/probability/distribution/exponential.py +2 -1
- mindspore/nn/probability/distribution/poisson.py +2 -1
- mindspore/nn/sparse/sparse.py +3 -3
- mindspore/nn/wrap/cell_wrapper.py +34 -37
- mindspore/nn/wrap/grad_reducer.py +37 -37
- mindspore/nn/wrap/loss_scale.py +72 -74
- mindspore/numpy/array_creations.py +5 -5
- mindspore/numpy/fft.py +1 -1
- mindspore/numpy/math_ops.py +5 -5
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +51 -13
- mindspore/ops/_grad_experimental/grad_debug_ops.py +14 -0
- mindspore/ops/_vmap/vmap_array_ops.py +31 -13
- mindspore/ops/_vmap/vmap_nn_ops.py +8 -16
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +42 -11
- mindspore/ops/auto_generate/gen_extend_func.py +23 -141
- mindspore/ops/auto_generate/gen_ops_def.py +727 -321
- mindspore/ops/auto_generate/gen_ops_prim.py +1721 -984
- mindspore/ops/auto_generate/pyboost_inner_prim.py +31 -1
- mindspore/ops/composite/__init__.py +10 -0
- mindspore/ops/composite/base.py +8 -4
- mindspore/ops/composite/multitype_ops/__init__.py +12 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +133 -109
- mindspore/ops/composite/multitype_ops/add_impl.py +70 -2
- mindspore/ops/composite/multitype_ops/div_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/floordiv_impl.py +29 -0
- mindspore/ops/composite/multitype_ops/getitem_impl.py +11 -0
- mindspore/ops/composite/multitype_ops/mod_impl.py +5 -3
- mindspore/ops/composite/multitype_ops/mul_impl.py +49 -0
- mindspore/ops/composite/multitype_ops/setitem_impl.py +57 -0
- mindspore/ops/composite/multitype_ops/sub_impl.py +34 -0
- mindspore/ops/composite/multitype_ops/zeros_like_impl.py +14 -0
- mindspore/ops/function/__init__.py +3 -1
- mindspore/ops/function/_add_attr_func.py +11 -6
- mindspore/ops/function/array_func.py +9 -96
- mindspore/ops/function/debug_func.py +4 -3
- mindspore/ops/function/grad/grad_func.py +1 -1
- mindspore/ops/function/math_func.py +33 -540
- mindspore/ops/function/nn_func.py +28 -74
- mindspore/ops/function/other_func.py +4 -1
- mindspore/ops/function/random_func.py +44 -5
- mindspore/ops/function/vmap_func.py +2 -1
- mindspore/ops/functional.py +2 -3
- mindspore/ops/functional_overload.py +571 -6
- mindspore/ops/op_info_register.py +21 -0
- mindspore/ops/operations/__init__.py +16 -11
- mindspore/ops/operations/_custom_ops_utils.py +689 -34
- mindspore/ops/operations/_inner_ops.py +3 -6
- mindspore/ops/operations/_sequence_ops.py +1 -1
- mindspore/ops/operations/array_ops.py +2 -2
- mindspore/ops/operations/comm_ops.py +185 -26
- mindspore/ops/operations/custom_ops.py +294 -174
- mindspore/ops/operations/debug_ops.py +59 -4
- mindspore/ops/operations/image_ops.py +13 -13
- mindspore/ops/operations/manually_defined/ops_def.py +15 -16
- mindspore/ops/operations/math_ops.py +3 -4
- mindspore/ops/operations/nn_ops.py +7 -39
- mindspore/ops/primitive.py +6 -10
- mindspore/ops/tensor_method.py +47 -8
- mindspore/ops_generate/api/cpp_create_prim_instance_helper_generator.py +1 -1
- mindspore/ops_generate/api/functional_map_cpp_generator.py +10 -9
- mindspore/ops_generate/api/functions_cc_generator.py +58 -10
- mindspore/ops_generate/api/tensor_func_reg_cpp_generator.py +1 -1
- mindspore/ops_generate/common/base_generator.py +14 -0
- mindspore/ops_generate/common/gen_constants.py +8 -3
- mindspore/ops_generate/common/gen_utils.py +0 -19
- mindspore/ops_generate/common/op_proto.py +11 -4
- mindspore/ops_generate/common/template.py +88 -11
- mindspore/ops_generate/gen_ops.py +1 -1
- mindspore/ops_generate/op_def/lite_ops_cpp_generator.py +4 -4
- mindspore/ops_generate/op_def/ops_def_cc_generator.py +0 -3
- mindspore/ops_generate/op_def/ops_name_h_generator.py +0 -3
- mindspore/ops_generate/op_def/ops_primitive_h_generator.py +0 -4
- mindspore/ops_generate/op_def_py/op_prim_py_generator.py +5 -2
- mindspore/ops_generate/pyboost/auto_grad_impl_cc_generator.py +49 -8
- mindspore/ops_generate/pyboost/auto_grad_reg_cc_generator.py +2 -2
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +31 -0
- mindspore/ops_generate/pyboost/op_template_parser.py +98 -72
- mindspore/ops_generate/pyboost/pyboost_functions_cpp_generator.py +70 -273
- mindspore/ops_generate/pyboost/pyboost_functions_h_generator.py +14 -6
- mindspore/ops_generate/pyboost/pyboost_functions_impl_cpp_generator.py +316 -0
- mindspore/ops_generate/pyboost/pyboost_functions_py_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_grad_function_cpp_generator.py +5 -3
- mindspore/ops_generate/pyboost/pyboost_inner_prim_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_internal_functions_cpp_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_functions_h_generator.py +76 -0
- mindspore/ops_generate/pyboost/pyboost_internal_kernel_info_adapter_generator.py +125 -0
- mindspore/ops_generate/pyboost/pyboost_native_grad_functions_generator.py +4 -3
- mindspore/ops_generate/pyboost/pyboost_op_cpp_code_generator.py +348 -61
- mindspore/ops_generate/pyboost/pyboost_overload_functions_cpp_generator.py +1 -1
- mindspore/ops_generate/pyboost/pyboost_utils.py +118 -9
- mindspore/ops_generate/tensor_py_cc_generator.py +1 -24
- mindspore/parallel/_auto_parallel_context.py +11 -8
- mindspore/parallel/_cell_wrapper.py +113 -45
- mindspore/parallel/_parallel_serialization.py +1 -1
- mindspore/parallel/_ps_context.py +4 -6
- mindspore/parallel/_tensor.py +167 -12
- mindspore/parallel/_transformer/moe.py +1 -1
- mindspore/parallel/_transformer/transformer.py +13 -8
- mindspore/parallel/auto_parallel.py +14 -7
- mindspore/parallel/checkpoint_convert.py +3 -3
- mindspore/parallel/checkpoint_transform.py +11 -7
- mindspore/parallel/cluster/process_entity/_api.py +84 -48
- mindspore/parallel/cluster/process_entity/_utils.py +95 -7
- mindspore/parallel/cluster/run.py +43 -4
- mindspore/parallel/function/__init__.py +8 -1
- mindspore/parallel/function/reshard_func.py +6 -7
- mindspore/parallel/nn/__init__.py +15 -2
- mindspore/parallel/nn/parallel_cell_wrapper.py +9 -10
- mindspore/parallel/nn/parallel_grad_reducer.py +7 -6
- mindspore/parallel/shard.py +3 -4
- mindspore/parallel/transform_safetensors.py +463 -174
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +7 -7
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +3 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +12 -6
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +4 -4
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +3 -3
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +4 -1
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +2 -1
- mindspore/profiler/analysis/task_manager.py +1 -1
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +5 -1
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +2 -1
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +42 -22
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +3 -2
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +9 -5
- mindspore/profiler/analysis/viewer/ms_operator_details_viewer.py +132 -0
- mindspore/profiler/common/constant.py +16 -0
- mindspore/profiler/common/profiler_context.py +25 -27
- mindspore/profiler/common/profiler_info.py +0 -16
- mindspore/profiler/common/profiler_op_analyse.py +235 -0
- mindspore/profiler/common/profiler_output_path.py +23 -8
- mindspore/profiler/common/profiler_parameters.py +128 -35
- mindspore/profiler/dynamic_profile/__init__.py +0 -0
- mindspore/profiler/dynamic_profile/dynamic_monitor_proxy.py +39 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_config_context.py +666 -0
- mindspore/profiler/dynamic_profile/dynamic_profiler_utils.py +62 -0
- mindspore/profiler/dynamic_profiler.py +305 -314
- mindspore/profiler/envprofiler.py +12 -7
- mindspore/profiler/experimental_config.py +96 -6
- mindspore/profiler/mstx.py +33 -12
- mindspore/profiler/platform/__init__.py +2 -3
- mindspore/profiler/platform/npu_profiler.py +29 -19
- mindspore/profiler/profiler.py +35 -19
- mindspore/profiler/profiler_action_controller.py +64 -76
- mindspore/profiler/schedule.py +10 -4
- mindspore/rewrite/common/config.py +1 -0
- mindspore/rewrite/common/namer.py +1 -0
- mindspore/rewrite/common/namespace.py +1 -0
- mindspore/rewrite/node/node.py +31 -11
- mindspore/rewrite/parsers/assign_parser.py +1 -1
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/run_check/_check_version.py +7 -10
- mindspore/runtime/__init__.py +5 -5
- mindspore/runtime/event.py +10 -4
- mindspore/runtime/executor.py +60 -45
- mindspore/runtime/memory.py +30 -32
- mindspore/runtime/thread_bind_core.py +298 -164
- mindspore/safeguard/rewrite_obfuscation.py +12 -13
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +14 -4
- mindspore/train/amp.py +43 -20
- mindspore/train/callback/__init__.py +5 -5
- mindspore/train/callback/_checkpoint.py +3 -6
- mindspore/train/callback/_flops_collector.py +1 -1
- mindspore/train/callback/_landscape.py +0 -1
- mindspore/train/callback/_train_fault_tolerance.py +97 -16
- mindspore/train/data_sink.py +11 -2
- mindspore/train/dataset_helper.py +9 -0
- mindspore/train/model.py +135 -55
- mindspore/train/serialization.py +133 -111
- mindspore/train/summary/summary_record.py +13 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +3 -2
- mindspore/utils/dryrun.py +0 -6
- mindspore/utils/runtime_execution_order_check.py +163 -77
- mindspore/utils/sdc_detect.py +68 -0
- mindspore/utils/utils.py +6 -9
- mindspore/version.py +1 -1
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +5 -4
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +333 -371
- mindspore/_deprecated/jit.py +0 -198
- mindspore/experimental/es/__init__.py +0 -22
- mindspore/experimental/es/embedding_service.py +0 -891
- mindspore/experimental/es/embedding_service_layer.py +0 -581
- mindspore/profiler/parser/__init__.py +0 -14
- mindspore/profiler/parser/aicpu_data_parser.py +0 -272
- mindspore/profiler/parser/ascend_analysis/__init__.py +0 -14
- mindspore/profiler/parser/ascend_analysis/constant.py +0 -71
- mindspore/profiler/parser/ascend_analysis/file_manager.py +0 -180
- mindspore/profiler/parser/ascend_analysis/function_event.py +0 -185
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +0 -136
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +0 -131
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +0 -104
- mindspore/profiler/parser/ascend_analysis/path_manager.py +0 -313
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +0 -123
- mindspore/profiler/parser/ascend_analysis/tlv_decoder.py +0 -86
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +0 -75
- mindspore/profiler/parser/ascend_cluster_generator.py +0 -116
- mindspore/profiler/parser/ascend_communicate_generator.py +0 -314
- mindspore/profiler/parser/ascend_flops_generator.py +0 -116
- mindspore/profiler/parser/ascend_fpbp_generator.py +0 -82
- mindspore/profiler/parser/ascend_hccl_generator.py +0 -271
- mindspore/profiler/parser/ascend_integrate_generator.py +0 -42
- mindspore/profiler/parser/ascend_memory_generator.py +0 -185
- mindspore/profiler/parser/ascend_msprof_exporter.py +0 -282
- mindspore/profiler/parser/ascend_msprof_generator.py +0 -187
- mindspore/profiler/parser/ascend_op_generator.py +0 -334
- mindspore/profiler/parser/ascend_steptrace_generator.py +0 -94
- mindspore/profiler/parser/ascend_timeline_generator.py +0 -545
- mindspore/profiler/parser/base_timeline_generator.py +0 -483
- mindspore/profiler/parser/container.py +0 -229
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +0 -697
- mindspore/profiler/parser/flops_parser.py +0 -531
- mindspore/profiler/parser/framework_enum.py +0 -111
- mindspore/profiler/parser/framework_parser.py +0 -464
- mindspore/profiler/parser/framework_struct.py +0 -61
- mindspore/profiler/parser/gpu_analysis/__init__.py +0 -14
- mindspore/profiler/parser/gpu_analysis/function_event.py +0 -44
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +0 -89
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +0 -72
- mindspore/profiler/parser/hccl_parser.py +0 -573
- mindspore/profiler/parser/hwts_log_parser.py +0 -122
- mindspore/profiler/parser/integrator.py +0 -526
- mindspore/profiler/parser/memory_usage_parser.py +0 -277
- mindspore/profiler/parser/minddata_analyzer.py +0 -800
- mindspore/profiler/parser/minddata_parser.py +0 -186
- mindspore/profiler/parser/minddata_pipeline_parser.py +0 -299
- mindspore/profiler/parser/op_intermediate_parser.py +0 -149
- mindspore/profiler/parser/optime_parser.py +0 -250
- mindspore/profiler/parser/profiler_info.py +0 -213
- mindspore/profiler/parser/step_trace_parser.py +0 -666
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.6.0rc1.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
mindspore/profiler/dynamic_profiler.py  +305 -314

@@ -15,36 +15,23 @@
 """Dynamic Profile Monitor"""
 import os
 import sys
+import json
 import time
 import stat
-import json
 import atexit
-import struct
 import random
 import multiprocessing

 from mindspore import log as logger
 from mindspore.train import Callback
 from mindspore.profiler import Profiler, tensorboard_trace_handler, schedule
-from mindspore.
-from mindspore.profiler.
-from mindspore.profiler.
-from mindspore.profiler.
-from mindspore.profiler.
-    ProfilerActivity,
-    ProfilerLevel,
-    AicoreMetrics,
-    ExportType,
-)
+from mindspore.profiler.common.file_manager import FileManager
+from mindspore.profiler.common.path_manager import PathManager
+from mindspore.profiler.dynamic_profile.dynamic_profiler_config_context import DynamicProfilerConfigContext
+from mindspore.profiler.dynamic_profile.dynamic_monitor_proxy import MsDynamicMonitorProxySingleton
+from mindspore.profiler.dynamic_profile.dynamic_profiler_utils import DynamicProfilerUtils
 from mindspore.profiler.common.util import no_exception_func
-
-
-def get_real_rank():
-    """get rank id"""
-    try:
-        return get_rank()
-    except RuntimeError:
-        return int(os.getenv("RANK_ID", "0"))
+from mindspore.profiler.profiler_interface import ProfilerInterface


 def print_msg(msg):
@@ -52,210 +39,21 @@ def print_msg(msg):
     print("[Dynamic Profiler] " + msg, flush=True)


-class DynamicProfilerArgs:
-    """
-    Data class for dynamic profile config.
-    """
-    FMT = "i" * 7 + "?" * 6
-    SIZE = struct.calcsize(FMT)
-
-    def __init__(self,
-                 start_step: int = -1,
-                 stop_step: int = -1,
-                 aic_metrics: int = -1,
-                 profiler_level: int = 0,
-                 analyse_mode: int = -1,
-                 activities: int = 0,
-                 export_type: int = 0,
-                 profile_memory: bool = False,
-                 mstx: bool = False,
-                 parallel_strategy: bool = False,
-                 with_stack: bool = False,
-                 data_simplification: bool = True,
-                 is_valid: bool = False,
-                 **kwargs):
-        self._start_step = start_step
-        self._stop_step = stop_step
-        self._aic_metrics = aic_metrics
-        self._profiler_level = profiler_level
-        self._analyse_mode = analyse_mode
-        self._activities = activities
-        self._export_type = export_type
-        self._profile_memory = profile_memory
-        self._mstx = mstx
-        self._parallel_strategy = parallel_strategy
-        self._with_stack = with_stack
-        self._data_simplification = data_simplification
-        self._is_valid = is_valid
-        self._check_params_type()
-
-    def _check_params_type(self):
-        """Check and enforce parameter types with lower complexity."""
-        # Define a parameter check rule. {Parameter name: (expected type, default value)}
-        param_rules = {
-            '_start_step': (int, -1),
-            '_stop_step': (int, -1),
-            '_aic_metrics': (int, -1),
-            '_profiler_level': (int, 0),
-            '_analyse_mode': (int, -1),
-            '_activities': (int, 0),
-            '_export_type': (int, 0),
-            '_profile_memory': (bool, False),
-            '_mstx': (bool, False),
-            '_parallel_strategy': (bool, False),
-            '_with_stack': (bool, False),
-            '_data_simplification': (bool, True),
-            '_is_valid': (bool, False)
-        }
-
-        def _is_valid_type(value, expected_type):
-            """Helper method for type checking."""
-            if expected_type is int and isinstance(value, bool):
-                return False
-            return isinstance(value, expected_type)
-
-        for param, (expected_type, default) in param_rules.items():
-            value = getattr(self, param)
-            if not _is_valid_type(value, expected_type):
-                logger.warning(
-                    f"{param[1:]} should be {expected_type.__name__} type, "
-                    f"will be reset to {default}."
-                )
-                setattr(self, param, default)
-
-    @property
-    def start_step(self):
-        """ get start step value."""
-        return self._start_step
-
-    @property
-    def stop_step(self):
-        """ get stop step value."""
-        return self._stop_step
-
-    @property
-    def is_valid(self):
-        """ get json valid value."""
-        return self._is_valid
-
-    @is_valid.setter
-    def is_valid(self, value):
-        """ set json valid value."""
-        self._is_valid = value
-
-    @property
-    def analyse_mode(self):
-        """ get analyse mode value."""
-        return self._convert_analyse_mode(self._analyse_mode)
-
-    @property
-    def vars(self):
-        """ get all values in DynamicProfilerArgs."""
-        not_supported_args = ['_is_valid']
-        res = {}
-        for key, value in self.__dict__.items():
-            if key not in not_supported_args:
-                res[key.replace('_', '', 1)] = value
-        return res
-
-    @property
-    def args(self):
-        """ get all args in DynamicProfilerArgs."""
-        self._profiler_level = self._convert_profiler_level(self._profiler_level)
-        self._activities = self._convert_activities(self._activities)
-        self._aic_metrics = self._convert_aic_metrics(self._aic_metrics)
-        self._export_type = self._convert_export_type(self._export_type)
-        not_supported_args = ['_start_step', '_stop_step', '_analyse_mode', '_is_valid']
-        res = {}
-        for key, value in self.__dict__.items():
-            if key not in not_supported_args:
-                res[key.replace('_', '', 1)] = value
-        return res
-
-    @classmethod
-    def from_bytes(cls, byte_data):
-        """ unpack bytes to DynamicProfilerArgs."""
-        unpacked = struct.unpack(cls.FMT, byte_data)
-        return cls(*unpacked)
-
-    def to_bytes(self):
-        """ pack DynamicProfilerArgs to bytes."""
-        instance_vars = tuple(self.__dict__.values())
-        if len(instance_vars) != len(self.FMT):
-            raise ValueError("Number of variables does not match format string.")
-        return struct.pack(DynamicProfilerArgs.FMT, *instance_vars)
-
-    def _convert_analyse_mode(self, analyse_mode: int) -> str:
-        """ convert analyse_mode to real args in Profiler."""
-        if analyse_mode == 0:
-            return 'sync'
-        if analyse_mode == 1:
-            return 'async'
-        return None
-
-    def _convert_profiler_level(self, profiler_level: int) -> ProfilerLevel:
-        """ convert profiler_level to real args in Profiler."""
-        if profiler_level == -1:
-            return ProfilerLevel.LevelNone
-        if profiler_level == 0:
-            return ProfilerLevel.Level0
-        if profiler_level == 1:
-            return ProfilerLevel.Level1
-        if profiler_level == 2:
-            return ProfilerLevel.Level2
-        return ProfilerLevel.Level0
-
-    def _convert_activities(self, activities: int) -> ProfilerLevel:
-        """ convert activities to real args in Profiler."""
-        if activities == 0:
-            return [ProfilerActivity.CPU, ProfilerActivity.NPU]
-        if activities == 1:
-            return [ProfilerActivity.CPU]
-        if activities == 2:
-            return [ProfilerActivity.NPU]
-        return [ProfilerActivity.CPU, ProfilerActivity.NPU]
-
-    def _convert_aic_metrics(self, aic_metrics: int) -> AicoreMetrics:
-        """ convert aic_metrics to real args in Profiler."""
-        if aic_metrics == -1:
-            return AicoreMetrics.AiCoreNone
-        if aic_metrics == 0:
-            return AicoreMetrics.PipeUtilization
-        if aic_metrics == 1:
-            return AicoreMetrics.ArithmeticUtilization
-        if aic_metrics == 2:
-            return AicoreMetrics.Memory
-        if aic_metrics == 3:
-            return AicoreMetrics.MemoryL0
-        if aic_metrics == 4:
-            return AicoreMetrics.MemoryUB
-        if aic_metrics == 5:
-            return AicoreMetrics.ResourceConflictRatio
-        if aic_metrics == 6:
-            return AicoreMetrics.L2Cache
-        if aic_metrics == 7:
-            return AicoreMetrics.MemoryAccess
-        return AicoreMetrics.AiCoreNone
-
-    def _convert_export_type(self, export_type: int) -> ExportType:
-        """ convert export_type to real args in Profiler."""
-        if export_type == 0:
-            return [ExportType.Text]
-        if export_type == 1:
-            return [ExportType.Db]
-        if export_type == 2:
-            return [ExportType.Text, ExportType.Db]
-        return [ExportType.Text]
-
-
 class DynamicProfilerMonitorBase(Callback):
     """
     Dynamic profile callback base class implementing the dynamic profile functionality.
     """
-
-
-
-    self.
+    NPU_MONITOR_START = "NPU_MONITOR_START"
+
+    def __init__(self, cfg_path=None, output_path=None, poll_interval=2, **kwargs):
+        self._is_dyno = DynamicProfilerUtils.is_dyno_mode()
+        self._rank_id = DynamicProfilerUtils.get_real_rank()
+        if not self._is_dyno:
+            self._cfg_path = cfg_path
+            self._cfg_json_path = os.path.join(self._cfg_path, "profiler_config.json")
+            self._cfg_json_path = os.path.realpath(self._cfg_json_path)
+            self._init_cfg_json()
         self._output_path = "dyn_profile_data" if output_path is None else output_path
         self._poll_interval = poll_interval
         if not isinstance(self._poll_interval, int):
@@ -268,7 +66,6 @@ class DynamicProfilerMonitorBase(Callback):

         self._kwargs = kwargs
         self._shm_name = time.strftime("DynamicProfileShm%Y%m%d%H", time.localtime())
-        self._rank_id = get_real_rank()
         self._shared_loop_flag = multiprocessing.Value('b', True)
         self._shm = None
         self._process = None
@@ -282,10 +79,11 @@ class DynamicProfilerMonitorBase(Callback):
         self._step_num = 0

         self._check_shm_for_killed()
-        self._init_cfg_json()
         self._create_shm()
         self._create_process()
         atexit.register(self._clean_resource)
+        if self._is_dyno:
+            atexit.register(self._finalize_dynolog)

     @no_exception_func()
     def step_begin(self, run_context):
@@ -295,13 +93,13 @@
         Args:
             run_context (RunContext): Context of the train running.
         """
-
-
+        prof_json = self._get_prof_args()
+        prof_args = DynamicProfilerConfigContext(prof_json)
         if not prof_args.is_valid:
             logger.error("Dynamic profile json is not valid, please check the json file.")
             return

-        if prof_args.start_step
+        if prof_args.start_step in (-1, self._last_start_step):
             return

         cb_params = run_context.original_args()
@@ -338,7 +136,8 @@
         Args:
             run_context (RunContext): Context of the train running.
         """
-
+        prof_json = self._get_prof_args()
+        prof_args = DynamicProfilerConfigContext(prof_json)

         if not prof_args.is_valid:
             logger.error("Dynamic profile json is not valid, please check the json file.")
@@ -415,19 +214,28 @@
             ... context.set_context(mode=mindspore.PYNATIVE_MODE)
             ... mindspore.set_device("Ascend")
             ... data_cfg = {
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
-            ...
+            ...     "start_step": 2,
+            ...     "stop_step": 5,
+            ...     "aic_metrics": "AiCoreNone",
+            ...     "profiler_level": "Level0",
+            ...     "analyse_mode": 0,
+            ...     "activities": ["CPU", "NPU"],
+            ...     "export_type": ["text"],
+            ...     "profile_memory": False,
+            ...     "mstx": False,
+            ...     "parallel_strategy": False,
+            ...     "with_stack": False,
+            ...     "data_simplification": True,
+            ...     "l2_cache": False,
+            ...     "analyse": True,
+            ...     "record_shape": False,
+            ...     "prof_path": "./data",
+            ...     "mstx_domain_include": [],
+            ...     "mstx_domain_exclude": [],
+            ...     "host_sys": [],
+            ...     "sys_io": False,
+            ...     "sys_interconnection": False
+            ... }
             ... output_path = "./cfg_path"
             ... cfg_path = os.path.join(output_path, "profiler_config.json")
             ... os.makedirs(output_path, exist_ok=True)
@@ -442,7 +250,8 @@
             ... for i in range(STEP_NUM):
             ...     print(f"step {i}")
             ...     train(net)
-            ...     # Modify the configuration file after step 7
+            ...     # Modify the configuration file after step 7
+            ...     # For example, change start_step to 8 and stop_step to 10
             ...     if i == 5:
             ...         # Modify parameters in the JSON file
             ...         change_cfg_json(os.path.join(output_path, "profiler_config.json"))
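The updated docstring example above boils down to writing a plain `profiler_config.json` file that the (non-dyno) monitor process polls. A minimal sketch of that step, using only a subset of the keys shown in the example (the remaining keys are presumably filled with the defaults defined by `DynamicProfilerConfigContext`):

    import os
    import json

    # Minimal sketch: write the config file that the dynamic profiler monitor polls.
    # The keys mirror the data_cfg example shown in the diff above.
    data_cfg = {
        "start_step": 2,
        "stop_step": 5,
        "profiler_level": "Level0",
        "activities": ["CPU", "NPU"],
        "export_type": ["text"],
        "analyse": True,
    }

    output_path = "./cfg_path"
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "profiler_config.json"), "w") as f:
        json.dump(data_cfg, f, indent=4)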
@@ -451,43 +260,96 @@
         """

         self._step_num += 1
-
+        prof_json = self._get_prof_args()
+        if not prof_json:
+            return
+        if self._is_dyno:
+            # Dyno monitor process
+            if self.NPU_MONITOR_START in prof_json:
+                self._call_dyno_monitor(prof_json)
+            return

+        prof_args = DynamicProfilerConfigContext(prof_json)
         if not prof_args.is_valid:
-            logger.error("Dynamic profile
+            logger.error("Dynamic profile config is not valid, please check the json or dyno config.")
             return
+        self._handle_profiler_setup(prof_args)

-        if
-
+        if self._profiler:
+            self._profiler.step()
+
+    def _handle_profiler_setup(self, args):
+        """Common handler for profiler setup logic shared between dyno and non-dyno paths."""
+        start_step = args.start_step
+        stop_step = args.stop_step

-
-        if self._step_num < prof_args.start_step:
+        if not self._is_valid_start_stop_step(self._step_num, start_step, stop_step):
             return

-        if self._start_step !=
-
-        self.
-
-        if
-            prof_path = os.path.join(self._output_path,
-                                     f"rank{self._rank_id}_start{self._start_step}_stop{self._stop_step}")
-            print_msg(f"Rank {self._rank_id} create output path {prof_path}")
-            print_msg(f"Rank {self._rank_id} Dynamic profile start at step {self._start_step}, "
-                      f"will stop at step {self._stop_step}")
-            self._profiler = Profiler(schedule=schedule(wait=0, warmup=0,
-                                                        active=self._stop_step - self._start_step + 1,
-                                                        repeat=1,
-                                                        skip_first=1),
-                                      on_trace_ready=tensorboard_trace_handler(dir_name=prof_path),
-                                      **prof_args.args)
-        else:
+        if self._start_step != start_step or self._stop_step != stop_step:
+            self._start_step = start_step
+            self._stop_step = stop_step
+
+        if not (start_step >= 0 and 0 <= start_step <= stop_step):
             self._profiler = None
-            logger.error(
-
-
+            logger.error(
+                "Rank %d Dynamic profile start at step %d and stop at step %d must be "
+                "greater than or equal to 0, and stop step should not be less than start step",
+                self._rank_id, start_step, stop_step
+            )
+            return

-
-
+        # Setup profiler configuration
+        active_steps = stop_step - start_step + 1
+        output_path = args.prof_path if args.prof_path != "./" else self._output_path
+        prof_path = os.path.join(
+            output_path,
+            f"rank{self._rank_id}_start{start_step}_stop{stop_step}"
+        )
+        print_msg(f"Rank {self._rank_id} create output path {prof_path}")
+        print_msg(
+            f"Rank {self._rank_id} Dynamic profile start at step {start_step}, "
+            f"will stop at step {stop_step}"
+        )
+        profiler_config = {
+            "schedule": schedule(
+                wait=0,
+                warmup=0,
+                active=active_steps,
+                repeat=1,
+                skip_first=1
+            ),
+            "on_trace_ready": tensorboard_trace_handler(
+                dir_name=prof_path,
+                analyse_flag=args.analyse,
+                async_mode=args.analyse_mode == "async",
+            ),
+            **args.args
+        }
+
+        self._profiler = Profiler(**profiler_config)
+
+    def _is_valid_start_stop_step(self, step_num, start_step, stop_step):
+        """Verify whether start_step and stop_step are valid parameters."""
+        if start_step < 0 or stop_step < 0:
+            return False
+
+        if step_num < start_step:
+            return False
+
+        if step_num > stop_step != self._stop_step:
+            logger.warning("stop_step must be greater than step_num, "
+                           "but get start_step = %d, stop_step = %d, step_num = %d", start_step, stop_step, step_num)
+            return False
+
+        return True
+
+    @no_exception_func()
+    def _call_dyno_monitor(self, dyno_args):
+        if "is_valid" in dyno_args:
+            del dyno_args["is_valid"]
+        dyno_monitor_proxy = MsDynamicMonitorProxySingleton().get_proxy()
+        dyno_monitor_proxy.enable_dyno_npu_monitor(dyno_args)

     @no_exception_func()
     def on_train_end(self, run_context):
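For reference, the schedule that the new `_handle_profiler_setup` builds can be reproduced in isolation. The sketch below mirrors the added code with hypothetical values (start_step=2, stop_step=5, rank 0) instead of values read from the shared-memory config; in the callback, the dict is then merged with the remaining config and passed to `Profiler(**profiler_config)`:

    from mindspore.profiler import tensorboard_trace_handler, schedule

    # Hypothetical values; in the callback they come from the parsed config context.
    start_step, stop_step, rank_id = 2, 5, 0
    active_steps = stop_step - start_step + 1  # steps 2..5 inclusive
    prof_path = f"./dyn_profile_data/rank{rank_id}_start{start_step}_stop{stop_step}"

    profiler_config = {
        # Same schedule parameters as in the diff: wait=0, warmup=0, repeat=1, skip_first=1.
        "schedule": schedule(wait=0, warmup=0, active=active_steps, repeat=1, skip_first=1),
        "on_trace_ready": tensorboard_trace_handler(dir_name=prof_path,
                                                    analyse_flag=True,
                                                    async_mode=False),
    }
    # The callback adds the remaining options (**args.args) and calls Profiler(**profiler_config).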
@@ -502,12 +364,16 @@
     def _get_prof_args(self):
         """ Get prof_args """
         logger.error("Dynamic profiler _get_prof_args is not implemented")
-        return
+        return {}

     def _clean_resource(self):
         """Clean resource"""
         logger.error("Dynamic profiler _clean_resource is not implemented")

+    def _finalize_dynolog(self):
+        """finalize dynolog"""
+        logger.error("Dynolog monitor _finalize_dynolog is not implemented")
+
     def _check_step(self, start_step, stop_step, step_num):
         """Check step valid"""
         if start_step <= 0 or stop_step <= 0:
@@ -535,9 +401,11 @@
         """Init config json file"""
         if self._rank_id == 0:
             if not os.path.exists(self._cfg_json_path):
-                logger.
-
-
+                logger.info("cfg_path is not exist, create default cfg json")
+                default_dy_config_context = DynamicProfilerConfigContext({})
+                PathManager.make_dir_safety(self._cfg_path)
+                config_file_path = os.path.join(self._cfg_path, "profiler_config.json")
+                FileManager.create_json_file(config_file_path, default_dy_config_context.vars, indent=4)
         else:
             logger.info("rank_id is not 0, skip init cfg json")
         print_msg(f"Init config json file: {self._cfg_json_path}")
@@ -550,10 +418,12 @@
     def _create_process(self):
         """Create json monitor process, one process will be created at one worker"""
         if self._is_create_process:
+            args = [self._shared_loop_flag, self._poll_interval, self._shm, self._rank_id] if self._is_dyno else \
+                [self._shared_loop_flag, self._poll_interval, self._shm, self._cfg_json_path]
             # daemon need to be set to True, otherwise the process will not be killed when the main process exits.
-            self._process = multiprocessing.Process(target=worker_func,
-
-
+            self._process = multiprocessing.Process(target=worker_dyno_func if self._is_dyno else worker_func,
                                                    daemon=True,
                                                    args=args)
             self._process.start()
             logger.info("Config monitor process has been created by rank %d.", self._rank_id)
         else:
@@ -573,7 +443,7 @@
         if not os.path.exists(shm_path):
             return

-        MAX_TIME_DIFF =
+        MAX_TIME_DIFF = 60  # seconds
         time_shm = os.stat(shm_path).st_ctime
         cur_proc_time = self._get_pid_st_ctime(os.getpid())

@@ -584,7 +454,7 @@
     def _get_pid_st_ctime(self, pid):
         """Get pid st_ctime"""
         try:
-            fd = os.open(
+            fd = os.open(os.path.join('/proc', str(pid)), os.O_RDONLY, stat.S_IRUSR | stat.S_IRGRP)
             stat_ino = os.fstat(fd)
             os.close(fd)
             create_time = stat_ino.st_ctime
@@ -593,7 +463,7 @@
             logger.error("Process with PID %d does not exist.", pid)
         except PermissionError:
             logger.error("Permission denied when accessing PID %d.", pid)
-        except Exception as ex:
+        except Exception as ex:  # pylint: disable=W0703
             logger.error("An error occurred while getting creation time for PID %d: %s", pid, str(ex))


@@ -601,7 +471,8 @@ if sys.version_info >= (3, 8):
    @no_exception_func()
    def write_bytes(shm, byte_data):
        """Write bytes to shared memory"""
-        shm.buf[:
+        shm.buf[:] = b'\x00' * len(shm.buf)
+        shm.buf[:len(byte_data)] = byte_data
else:
    @no_exception_func()
    def write_bytes(shm, byte_data):
@@ -624,15 +495,13 @@ def worker_func(loop_flag, poll_interval, shm, cfg_path):
            try:
                with open(cfg_path, 'r') as f:
                    data = json.load(f)

-
-                prof_args = DynamicProfilerArgs(**data)
-                prof_args.is_valid = True
+                data['is_valid'] = True
                logger.info("Dynamic profiler process load json success")
            except json.JSONDecodeError as e:
-
-                prof_args.is_valid = False
+                data = {'is_valid': False}
                logger.error("Dynamic profiler process load json failed: %s", e)
-
+            # convert json to bytes
+            byte_data = DynamicProfilerConfigContext.json_to_bytes(data)
            write_bytes(shm, byte_data)
        else:
            logger.error("Dynamic profiler cfg json not exists")
@@ -640,6 +509,36 @@ def worker_func(loop_flag, poll_interval, shm, cfg_path):
    logger.info("Dynamic profiler process done")


+@no_exception_func()
+def worker_dyno_func(loop_flag, poll_interval, shm, rank_id):
+    """ dyno monitor process worker function python version >= 3.8"""
+    proxy = MsDynamicMonitorProxySingleton().get_proxy()
+    ret = proxy.init_dyno(rank_id)
+
+    if not ret:
+        logger.warning("Rank %d init dynolog failed !")
+        return
+    print_msg("Init dynolog success !")
+
+    while loop_flag.value:
+        try:
+            res = proxy.poll_dyno()
+            if not res:
+                continue
+            data = DynamicProfilerUtils.dyno_str_to_dict(res)
+        except Exception as e:  # pylint: disable=broad-except
+            data = {'is_valid': False}
+            logger.error("Dynolog process load config failed: %s", e)
+        else:
+            data['is_valid'] = True
+
+        # convert dyno config json to bytes
+        byte_data = DynamicProfilerConfigContext.json_to_bytes(data)
+        write_bytes(shm, byte_data)
+        time.sleep(poll_interval)
+    logger.info("Dynolog process done")
+
+
if sys.version_info >= (3, 8):
    from multiprocessing import shared_memory
    from unittest.mock import patch
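The JSON hand-off between the monitor process and the training process goes through a fixed-size shared-memory buffer: `write_bytes` zero-fills the buffer before copying the payload, and the reader scans for the first zero byte to find the payload length. A stdlib-only sketch of that round trip, assuming the config is encoded as plain UTF-8 JSON (which is what `DynamicProfilerConfigContext.json_to_bytes`/`bytes_to_json` appear to abstract); the buffer size and shared-memory name here are hypothetical:

    import json
    from multiprocessing import shared_memory

    CFG_BUFFER_SIZE = 1024  # hypothetical; the real value is DynamicProfilerUtils.CFG_BUFFER_SIZE

    def write_cfg(shm, cfg):
        """Zero the whole buffer, then copy the encoded config (mirrors write_bytes)."""
        payload = json.dumps(cfg).encode("utf-8")
        shm.buf[:] = b"\x00" * len(shm.buf)
        shm.buf[:len(payload)] = payload

    def read_cfg(shm):
        """Scan to the first zero byte, then decode (mirrors _get_shm_byte_length / _get_prof_args)."""
        length = 0
        for i, byte in enumerate(shm.buf):
            if byte == 0:
                length = i
                break
        return json.loads(bytes(shm.buf[:length])) if length else {}

    shm = shared_memory.SharedMemory(name="DynamicProfileShmDemo", create=True, size=CFG_BUFFER_SIZE)
    try:
        write_cfg(shm, {"start_step": 2, "stop_step": 5, "is_valid": True})
        print(read_cfg(shm))  # {'start_step': 2, 'stop_step': 5, 'is_valid': True}
    finally:
        shm.close()
        shm.unlink()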
@@ -660,27 +559,46 @@ if sys.version_info >= (3, 8):
                  a relative value, with the first step of training being 1. The stop_step must be greater than or
                  equal to start_step. The default value is -1, indicating that data collection will not start during
                  the entire training process.
-                - aic_metrics (int, optional) -
-
-
-
-
-
-
-
-                -
-
-
-
-
+                - aic_metrics (int/str, optional) - Set the collection of AI Core metric data. The current version can
+                  pass in either type int or str. Later, it will be updated to only pass in the str type.
+                  Here, ``0`` and ``"PipeUtilization"`` represent PipeUtilization; ``1`` and ``"ArithmeticUtilization"``
+                  represent ArithmeticUtilization; ``2`` and ``"Memory"`` represent Memory; ``3`` and ``"MemoryL0"``
+                  represent MemoryL0; ``4`` and ``"MemoryUB"`` stand for MemoryUB; ``5`` and ``"ResourceConflictRatio"``
+                  represent ResourceConflictRatio; ``6`` and ``"L2Cache"`` represent L2Cache; ``7`` and
+                  ``"MemoryAccess"`` stand for MemoryAccess. The default value ``"AiCoreNone"`` indicates that the
+                  AI Core metric is not collected.
+                - profiler_level (int/str, optional) - Set the level for collecting performance data. The current
+                  version can pass in either type int or str, and it will be updated to only pass in str type
+                  in the future. Among them, ``-1`` and ``"LevelNone"`` represent ProfilerLevel.LevelNone, ``0``
+                  and ``"Level0"`` represent ProfilerLevel.Level0, and ``1`` and ``"Level1"`` represent
+                  ProfilerLevel.Level1. ``2`` and ``"Level2"`` stand for Profile Level.Level2.
+                  The default value ``"Level0"`` indicates the collection level of ProfilerLevel.Level0.
+                - activities (int/list, optional) - Set the device for collecting performance data.
+                  The current version can pass in either type int or list. Later, it will be updated to only
+                  pass in the list type. Among them, ``0`` and ``["CPU","NPU"]`` represent CPU+NPU, ``1`` and
+                  ``["CPU"]`` represent CPU, and ``2`` and ``["NPU"]`` represent NPU. The default values
+                  ``["CPU","NPU"]`` indicate the collection of performance data of CPU+NPU.
+                - export_type (int/list, optional) - Set the type of the exported performance data.
+                  The current version can pass in either type int or list, and it will be updated later
+                  to only pass in the list type. Among them, ``0`` and ``["text"]`` represent text, ``1`` and ``["db"]``
+                  represent db, and ``2`` and ``["text","db"]`` represent text and db respectively. The default value
+                  ``["text"]`` indicates that only performance data of the text type is exported.
                - profile_memory (bool, optional) - Set whether to collect memory performance data, true indicates that
                  memory performance data is collected, false indicates that memory performance data is not collected.
                  The default value is false, indicating that memory performance data is not collected.
                - mstx (bool, optional) - Set whether to enable mstx, true indicates that mstx is enabled, false
                  indicates that mstx is disabled. The default value is false, indicating that mstx is not enabled.
-                -
-
-
+                - analyse (bool, optional) - Set whether to enable online analysis. True indicates that online analysis
+                  is enabled, while false indicates that online analysis is disabled. The default value is false,
+                  indicating that online analysis is not enabled. This parameter has a higher priority than the
+                  `analyse_mode` parameter. When this parameter is set to false, the setting of the `analyse_mode`
+                  parameter does not take effect. When this parameter is set to true,
+                  setting the `analyse_mode` parameter to -1 does not take effect.
+                - analyse_mode (int, optional) - Sets the mode for online analysis,
+                  where 0 represents "sync" and 1 represents "async". The default value is -1,
+                  indicating that online analysis is not used. This parameter has a lower priority than the `analyse`
+                  parameter. When the `analyse` parameter is set to false, the setting of this parameter does not take
+                  effect. When the `analyse` parameter is set to true, setting it to -1 does not take effect.
                - parallel_strategy (bool, optional) - Sets whether to collect parallel strategy performance data,
                  where true means to collect and false means not to collect. The default value is false, indicating
                  that parallel strategy performance data is not collected.
@@ -690,6 +608,44 @@ if sys.version_info >= (3, 8):
                - data_simplification (bool, optional) - Sets whether to enable data simplification, where true means
                  to enable and false means not to enable. The default value is true, indicating that data
                  simplification is enabled.
+                - record_shapes (bool, optional) - Sets whether to collect operator input tensor shapes data, where true
+                  means that the shape data is collected and false means that the shape data is not collected. The
+                  default value is false, indicating that input tensor shapes data is not collected.
+                - mstx_domain_include (list, optional) - Set the set of enabled domain names when the mstx switch
+                  is turned on. The name must be of str type. Default value: ``[]``, indicating that this parameter
+                  is not used to control the domain. This parameter is mutually exclusive with the mstx_domain_exclude
+                  parameter and cannot be set. simultaneously. If both are set, only the mstx_domain_include parameter
+                  takes effect.
+                - mstx_domain_exclude (list, optional) - Set the set of domain names that are not enabled when the
+                  mstx switch is turned on. The name must be of str type. Default value: ``[]``, indicating that this
+                  parameter is not used to control the domain.
+                - prof_path (str, optional) - Output data path of the dynamic profiler. It is the same as the interface
+                  parameter `output_path`. When both are set, `prof_path` takes effect. Default value:
+                  ``"./"`` .
+                - sys_io (bool, optional) - Set whether to collect NIC and RoCE data. Default value: ``False`` ,
+                  indicating that these data are not collected.
+                - sys_interconnection (bool, optional) - Set whether to collect system interconnection data,
+                  including aggregate collective communication statistics (HCCS), PCIe data, and inter-chip transmission
+                  bandwidth information. Default value: ``False`` , indicating that these data are not collected.
+                - host_sys (list, optional) - Collect the data of system class calls, storage classes and cpu usage
+                  rate on the host side, and pass in the list type. It supports passing in one or more of ``"cpu"``,
+                  ``"mem"``, ``"disk"``, ``"network"`` and ``"osrt"``. Among them, ``"cpu"`` represents the cpu
+                  utilization at the process level, ``"mem"`` represents the memory utilization at the process level,
+                  ``"disk"`` represents the disk I/O utilization at the process level, and ``"network"`` represents the
+                  network I/O utilization at the system level. ``"osrt"`` represents system-level syscall and
+                  pthreadcall. Default value: ``[]``, indicating that system class data on the host side is
+                  not collected. When collecting DISK or OSRT data, it is necessary to install the iotop, perf,
+                  and ltrace third-party tools in advance. For detailed steps, please refer to
+                  `Installing Third-party Tools <https://www.hiascend.com/document/detail/zh/mindstudio/80RC1/T&ITools/
+                  Profiling/atlasprofiling_16_0136.html>`_ .
+                  After the third-party tool is successfully installed, user permissions need to be configured. For
+                  detailed steps, please refer to `Configure User Permissions <https://www.hiascend.com/document/
+                  detail/zh/mindstudio/80RC1/T&ITools/Profiling/atlasprofiling_16_0137.
+                  html>`_ .
+                  Note that in step 3 of configuring user permissions, the content in the msprof_data_collection.sh
+                  script needs to be replaced with `msprof_data_collection.sh
+                  <https://gitee.com/mindspore/mindspore/blob/master/docs/api/api_python/mindspore/script/
+                  msprof_data_collection.sh>`_.

            output_path (str, optional): (Ascend only) Output data path. Default: ``"./dyn_profile_data"`` .
            poll_interval (int, optional): (Ascend only) The polling period of the monitoring process, in seconds.
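Putting the pieces together, the callback itself is constructed with the constructor parameters documented above. A short usage sketch (the import path is assumed from MindSpore's public profiler API, and the `model.train` call stands in for the user's own Model and dataset, as in the docstring example):

    import os
    from mindspore.profiler import DynamicProfilerMonitor  # assumed public import path

    cfg_dir = "./cfg_path"
    os.makedirs(cfg_dir, exist_ok=True)

    # Polls <cfg_dir>/profiler_config.json every poll_interval seconds; when
    # MSMONITOR_USE_DAEMON=1 is exported, cfg_path is ignored and dynolog is used instead.
    dynprof_cb = DynamicProfilerMonitor(cfg_path=cfg_dir,
                                        output_path="./dyn_profile_data",
                                        poll_interval=2)

    # Then, as in the docstring example:
    # model.train(10, data, callbacks=[dynprof_cb])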
@@ -729,9 +685,13 @@ if sys.version_info >= (3, 8):
            ... model.train(10, data, callbacks=[dynprof_cb])
            """

-        def __init__(self, cfg_path, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
-            if
-
+        def __init__(self, cfg_path=None, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
+            if DynamicProfilerUtils.is_dyno_mode() and cfg_path is not None:
+                logger.warning("If you export 'MSMONITOR_USE_DAEMON=1', your 'cfg_path' parameter will be invalid!")
+                cfg_path = None
+
+            if not DynamicProfilerUtils.is_dyno_mode() and not isinstance(cfg_path, str):
+                raise TypeError("If you set 'MSMONITOR_USE_DAEMON' to not 1, The cfg_path must be a string.")
            if not isinstance(output_path, str):
                logger.warning(f"The output_path must be a string, "
                               f"but got type {type(output_path)}, it will be set to './dyn_profile_data'.")
@@ -740,7 +700,21 @@ if sys.version_info >= (3, 8):

        def _get_prof_args(self):
            """ Get prof_args py38"""
-
+            byte_length = self._get_shm_byte_length()
+
+            if byte_length == 0:
+                return {}
+
+            valid_bytes = self._shm.buf[:byte_length]
+            return DynamicProfilerConfigContext.bytes_to_json(bytes(valid_bytes))
+
+        def _get_shm_byte_length(self):
+            byte_length = 0
+            for i, byte in enumerate(self._shm.buf):
+                if byte == 0:
+                    byte_length = i
+                    break
+            return byte_length

        @no_exception_func()
        def _clean_resource(self):
@@ -770,6 +744,12 @@ if sys.version_info >= (3, 8):
                logger.warning("Rank %s unlink shm failed, may be removed", self._rank_id)
            self._shm = None

+        @no_exception_func()
+        def _finalize_dynolog(self):
+            dyno_monitor_proxy = MsDynamicMonitorProxySingleton().get_proxy()
+            dyno_monitor_proxy.finalize_dyno()
+            logger.info("Rank %d finalize dynolog success !", self._rank_id)
+
        @no_exception_func()
        def _create_shm(self):
            """Create a json monitor process based on whether the SharedMemory is successfully created py38"""
@@ -789,7 +769,7 @@ if sys.version_info >= (3, 8):
                try:
                    # Step 2: only one process can create shm successfully.
                    self._shm = shared_memory.SharedMemory(name=self._shm_name,
-                                                           create=True, size=
+                                                           create=True, size=DynamicProfilerUtils.CFG_BUFFER_SIZE)
                    self._is_create_process = True
                    logger.info("Rank %d shared memory is created.", self._rank_id)
                    break
@@ -799,7 +779,7 @@ if sys.version_info >= (3, 8):
                    logger.warning("Rank %d shared memory create failed, "
                                   "retry times = %d.", self._rank_id, try_times)
                    time.sleep(random.uniform(0, 0.02))  # sleep 0 ~ 20 ms
-                except Exception as e:
+                except Exception as e:  # pylint: disable=W0703
                    # shm open failed because of other process create shm not finished
                    try_times -= 1
                    logger.warning("Rank %d shared memory open failed, error: %s, retry times = %d",
@@ -858,16 +838,24 @@ else:
            ... model.train(10, data, callbacks=[dynprof_cb])
            """

-        def __init__(self, cfg_path, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
-            if
-
+        def __init__(self, cfg_path=None, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
+            if DynamicProfilerUtils.is_dyno_mode() and cfg_path is not None:
+                logger.warning("If you export 'MSMONITOR_USE_DAEMON=1', your 'cfg_path' parameter will be invalid!")
+                cfg_path = None
+
+            if not DynamicProfilerUtils.is_dyno_mode() and not isinstance(cfg_path, str):
+                raise TypeError("If you set 'MSMONITOR_USE_DAEMON' to not 1, The cfg_path must be a string.")
+
            if not isinstance(output_path, str):
                logger.warning(f"The output_path must be a string, "
                               f"but got type {type(output_path)}, it will be set to './dyn_profile_data'.")
                output_path = "./dyn_profile_data"
            self._cfg_path = cfg_path
            self._shm_name = time.strftime("DynamicProfileShm%Y%m%d%H", time.localtime())
-            self._shm_dir =
+            self._shm_dir = (
+                "/dev/shm" if DynamicProfilerUtils.is_dyno_mode()
+                else os.path.join(self._cfg_path, "shm")
+            )
            PathManager.make_dir_safety(self._shm_dir)
            self._shm_path = os.path.realpath(os.path.join(self._shm_dir, self._shm_name))

@@ -878,7 +866,8 @@ else:
        def _get_prof_args(self):
            """ Get prof_args py37"""
            self._shm.seek(0)
-            return
+            return DynamicProfilerConfigContext.bytes_to_json(
+                bytes(self._shm.read(DynamicProfilerUtils.CFG_BUFFER_SIZE)))

        @no_exception_func()
        def _clean_resource(self):
@@ -923,7 +912,8 @@ else:
                    self.fd = os.open(self._shm_path, os.O_EXCL | os.O_RDWR,
                                      stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)
                    self._memory_mapped_file = os.fdopen(self.fd, 'rb')
-                    self._shm = mmap.mmap(self._memory_mapped_file.fileno(),
+                    self._shm = mmap.mmap(self._memory_mapped_file.fileno(),
+                                          length=DynamicProfilerUtils.CFG_BUFFER_SIZE)
                    self._is_create_process = False
                    logger.info("Rank %d shared memory is connected.", self._rank_id)
                    break
@@ -937,7 +927,7 @@ else:

                    # Init mmap file need to write data
                    with os.fdopen(fd, 'wb') as f:
-                        data_instance =
+                        data_instance = DynamicProfilerConfigContext({})
                        byte_data = data_instance.to_bytes()
                        f.write(byte_data)

@@ -945,7 +935,8 @@ else:
                    self.fd = os.open(self._shm_path, os.O_EXCL | os.O_RDWR,
                                      stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)
                    self._memory_mapped_file = os.fdopen(self.fd, 'rb')
-                    self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=
+                    self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=DynamicProfilerUtils.
+                                          CFG_BUFFER_SIZE)
                    self._is_create_process = True
                    logger.info("Rank %d shared memory is created.", self._rank_id)
                    break