mindspore-2.3.0-cp39-cp39-win_amd64.whl → mindspore-2.4.0-cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +3 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +50 -9
- mindspore/_extends/parse/compile_config.py +41 -0
- mindspore/_extends/parse/parser.py +9 -7
- mindspore/_extends/parse/standard_method.py +52 -14
- mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
- mindspore/amp.py +24 -10
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/common/__init__.py +6 -4
- mindspore/common/_pijit_context.py +190 -0
- mindspore/common/_register_for_tensor.py +2 -1
- mindspore/common/_tensor_overload.py +139 -0
- mindspore/common/api.py +102 -87
- mindspore/common/dump.py +5 -6
- mindspore/common/generator.py +1 -7
- mindspore/common/hook_handle.py +14 -26
- mindspore/common/mindir_util.py +2 -2
- mindspore/common/parameter.py +46 -13
- mindspore/common/recompute.py +39 -9
- mindspore/common/sparse_tensor.py +7 -3
- mindspore/common/tensor.py +209 -29
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +38 -3
- mindspore/communication/comm_func.py +310 -55
- mindspore/communication/management.py +14 -14
- mindspore/context.py +123 -22
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/__init__.py +1 -1
- mindspore/dataset/core/config.py +7 -0
- mindspore/dataset/core/validator_helpers.py +7 -0
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +72 -44
- mindspore/dataset/engine/datasets_audio.py +7 -7
- mindspore/dataset/engine/datasets_standard_format.py +53 -3
- mindspore/dataset/engine/datasets_text.py +20 -20
- mindspore/dataset/engine/datasets_user_defined.py +174 -104
- mindspore/dataset/engine/datasets_vision.py +33 -33
- mindspore/dataset/engine/iterators.py +29 -0
- mindspore/dataset/engine/obs/util.py +7 -0
- mindspore/dataset/engine/queue.py +114 -60
- mindspore/dataset/engine/serializer_deserializer.py +2 -2
- mindspore/dataset/engine/validators.py +34 -14
- mindspore/dataset/text/__init__.py +1 -4
- mindspore/dataset/transforms/__init__.py +0 -3
- mindspore/dataset/utils/line_reader.py +2 -0
- mindspore/dataset/vision/__init__.py +1 -4
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/dataset/vision/validators.py +2 -1
- mindspore/dnnl.dll +0 -0
- mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
- mindspore/experimental/es/embedding_service.py +883 -0
- mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
- mindspore/experimental/llm_boost/__init__.py +21 -0
- mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
- mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
- mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
- mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
- mindspore/experimental/llm_boost/register.py +129 -0
- mindspore/experimental/llm_boost/utils.py +31 -0
- mindspore/experimental/optim/adamw.py +85 -0
- mindspore/experimental/optim/optimizer.py +3 -0
- mindspore/hal/__init__.py +3 -3
- mindspore/hal/contiguous_tensors_handle.py +175 -0
- mindspore/hal/stream.py +18 -0
- mindspore/include/api/model_group.h +13 -1
- mindspore/include/api/types.h +10 -10
- mindspore/include/dataset/config.h +2 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/dataset/execute.h +2 -2
- mindspore/include/dataset/vision.h +4 -0
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +1 -1
- mindspore/mindrecord/filewriter.py +68 -51
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +495 -46
- mindspore/mint/distributed/__init__.py +31 -0
- mindspore/mint/distributed/distributed.py +254 -0
- mindspore/mint/nn/__init__.py +266 -21
- mindspore/mint/nn/functional.py +125 -19
- mindspore/mint/nn/layer/__init__.py +39 -0
- mindspore/mint/nn/layer/activation.py +133 -0
- mindspore/mint/nn/layer/normalization.py +477 -0
- mindspore/mint/nn/layer/pooling.py +110 -0
- mindspore/mint/optim/adamw.py +28 -7
- mindspore/mint/special/__init__.py +63 -0
- mindspore/multiprocessing/__init__.py +2 -1
- mindspore/nn/__init__.py +0 -1
- mindspore/nn/cell.py +275 -93
- mindspore/nn/layer/activation.py +211 -44
- mindspore/nn/layer/basic.py +113 -3
- mindspore/nn/layer/embedding.py +120 -2
- mindspore/nn/layer/normalization.py +101 -5
- mindspore/nn/layer/padding.py +34 -48
- mindspore/nn/layer/pooling.py +161 -7
- mindspore/nn/layer/transformer.py +3 -3
- mindspore/nn/loss/__init__.py +2 -2
- mindspore/nn/loss/loss.py +84 -6
- mindspore/nn/optim/__init__.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +127 -0
- mindspore/nn/wrap/cell_wrapper.py +12 -23
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/nn/wrap/loss_scale.py +17 -3
- mindspore/numpy/__init__.py +1 -1
- mindspore/numpy/array_creations.py +65 -68
- mindspore/numpy/array_ops.py +64 -60
- mindspore/numpy/fft.py +610 -75
- mindspore/numpy/logic_ops.py +11 -10
- mindspore/numpy/math_ops.py +85 -84
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -4
- mindspore/ops/_grad_experimental/grad_comm_ops.py +47 -3
- mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
- mindspore/ops/_vmap/vmap_array_ops.py +2 -4
- mindspore/ops/_vmap/vmap_math_ops.py +17 -1
- mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +85 -7
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
- mindspore/ops/auto_generate/gen_extend_func.py +734 -13
- mindspore/ops/auto_generate/gen_ops_def.py +2420 -381
- mindspore/ops/auto_generate/gen_ops_prim.py +5196 -1659
- mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
- mindspore/ops/composite/base.py +85 -48
- mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
- mindspore/ops/function/__init__.py +22 -0
- mindspore/ops/function/array_func.py +490 -153
- mindspore/ops/function/debug_func.py +113 -1
- mindspore/ops/function/fft_func.py +15 -2
- mindspore/ops/function/grad/grad_func.py +3 -2
- mindspore/ops/function/math_func.py +558 -207
- mindspore/ops/function/nn_func.py +817 -383
- mindspore/ops/function/other_func.py +3 -2
- mindspore/ops/function/random_func.py +184 -8
- mindspore/ops/function/reshard_func.py +13 -11
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +3 -2
- mindspore/ops/functional.py +24 -14
- mindspore/ops/op_info_register.py +3 -3
- mindspore/ops/operations/__init__.py +6 -1
- mindspore/ops/operations/_grad_ops.py +2 -76
- mindspore/ops/operations/_infer_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +71 -94
- mindspore/ops/operations/array_ops.py +12 -146
- mindspore/ops/operations/comm_ops.py +42 -53
- mindspore/ops/operations/custom_ops.py +83 -19
- mindspore/ops/operations/debug_ops.py +42 -10
- mindspore/ops/operations/manually_defined/_inner.py +12 -0
- mindspore/ops/operations/manually_defined/ops_def.py +265 -10
- mindspore/ops/operations/math_ops.py +12 -223
- mindspore/ops/operations/nn_ops.py +20 -114
- mindspore/ops/operations/other_ops.py +7 -4
- mindspore/ops/operations/random_ops.py +46 -1
- mindspore/ops/primitive.py +18 -6
- mindspore/ops_generate/arg_dtype_cast.py +2 -0
- mindspore/ops_generate/gen_aclnn_implement.py +11 -11
- mindspore/ops_generate/gen_constants.py +36 -0
- mindspore/ops_generate/gen_ops.py +67 -52
- mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
- mindspore/ops_generate/gen_pyboost_func.py +131 -47
- mindspore/ops_generate/op_proto.py +10 -3
- mindspore/ops_generate/pyboost_utils.py +14 -1
- mindspore/ops_generate/template.py +43 -21
- mindspore/parallel/__init__.py +3 -1
- mindspore/parallel/_auto_parallel_context.py +28 -8
- mindspore/parallel/_cell_wrapper.py +83 -0
- mindspore/parallel/_parallel_serialization.py +47 -19
- mindspore/parallel/_tensor.py +81 -11
- mindspore/parallel/_utils.py +13 -1
- mindspore/parallel/algo_parameter_config.py +5 -5
- mindspore/parallel/checkpoint_transform.py +46 -39
- mindspore/parallel/cluster/process_entity/__init__.py +1 -1
- mindspore/parallel/cluster/process_entity/_api.py +31 -23
- mindspore/parallel/cluster/process_entity/_utils.py +2 -27
- mindspore/parallel/parameter_broadcast.py +3 -4
- mindspore/parallel/shard.py +162 -31
- mindspore/parallel/transform_safetensors.py +993 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/constant.py +29 -0
- mindspore/profiler/common/registry.py +47 -0
- mindspore/profiler/common/util.py +28 -0
- mindspore/profiler/dynamic_profiler.py +694 -0
- mindspore/profiler/envprofiling.py +17 -19
- mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
- mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
- mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
- mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
- mindspore/profiler/parser/base_timeline_generator.py +19 -25
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
- mindspore/profiler/parser/framework_parser.py +1 -391
- mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
- mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
- mindspore/profiler/parser/memory_usage_parser.py +0 -154
- mindspore/profiler/parser/profiler_info.py +78 -6
- mindspore/profiler/profiler.py +153 -0
- mindspore/profiler/profiling.py +280 -412
- mindspore/rewrite/__init__.py +1 -2
- mindspore/rewrite/common/namespace.py +4 -4
- mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
- mindspore/run_check/_check_version.py +36 -103
- mindspore/safeguard/rewrite_obfuscation.py +591 -247
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +4 -3
- mindspore/train/_utils.py +28 -2
- mindspore/train/amp.py +171 -53
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +85 -22
- mindspore/train/callback/_cluster_monitor.py +1 -1
- mindspore/train/callback/_flops_collector.py +1 -0
- mindspore/train/callback/_loss_monitor.py +3 -3
- mindspore/train/callback/_on_request_exit.py +134 -31
- mindspore/train/callback/_summary_collector.py +5 -5
- mindspore/train/callback/_tft_register.py +352 -0
- mindspore/train/dataset_helper.py +7 -3
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/metrics/roc.py +4 -4
- mindspore/train/mind_ir_pb2.py +44 -39
- mindspore/train/model.py +134 -58
- mindspore/train/serialization.py +336 -112
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +21 -0
- mindspore/utils/utils.py +60 -0
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/METADATA +6 -2
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/RECORD +258 -252
- mindspore/include/c_api/ms/abstract.h +0 -67
- mindspore/include/c_api/ms/attribute.h +0 -197
- mindspore/include/c_api/ms/base/handle_types.h +0 -43
- mindspore/include/c_api/ms/base/macros.h +0 -32
- mindspore/include/c_api/ms/base/status.h +0 -33
- mindspore/include/c_api/ms/base/types.h +0 -283
- mindspore/include/c_api/ms/context.h +0 -102
- mindspore/include/c_api/ms/graph.h +0 -160
- mindspore/include/c_api/ms/node.h +0 -606
- mindspore/include/c_api/ms/tensor.h +0 -161
- mindspore/include/c_api/ms/value.h +0 -84
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/nn/extend/basic.py +0 -140
- mindspore/nn/extend/embedding.py +0 -143
- mindspore/nn/extend/layer/normalization.py +0 -109
- mindspore/nn/extend/pooling.py +0 -117
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
- mindspore/ops/extend/__init__.py +0 -53
- mindspore/ops/extend/array_func.py +0 -218
- mindspore/ops/extend/math_func.py +0 -76
- mindspore/ops/extend/nn_func.py +0 -308
- mindspore/ops/silent_check.py +0 -162
- mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
- mindspore/profiler/parser/msadvisor_parser.py +0 -240
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/top_level.txt +0 -0
mindspore/profiler/dynamic_profiler.py
@@ -0,0 +1,694 @@
# Copyright 2022-2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Dynamic Profile Monitor"""
import os
import sys
import time
import json
import atexit
import struct
import random
import multiprocessing

from mindspore import log as logger
from mindspore.train import Callback
from mindspore.profiler import Profiler
from mindspore.profiler import ProfilerLevel
from mindspore.communication import get_rank
from mindspore.profiler.parser.ascend_analysis.file_manager import FileManager
from mindspore.profiler.parser.ascend_analysis.path_manager import PathManager


def get_real_rank():
    """get rank id"""
    try:
        return get_rank()
    except RuntimeError:
        return int(os.getenv("RANK_ID", "0"))


def print_msg(msg):
    """print msg"""
    print("[Dynamic Profiler] " + msg, flush=True)


class DynamicProfilerArgs:
    """
    Data class for dynamic profile config.
    """
    FMT = "iiiiii?????"
    SIZE = struct.calcsize(FMT)

    def __init__(self,
                 start_step: int = -1,
                 stop_step: int = -1,
                 aicore_metrics: int = -1,
                 profiler_level: int = -1,
                 profile_framework: int = -1,
                 analyse_mode: int = -1,
                 profile_communication: bool = False,
                 parallel_strategy: bool = False,
                 with_stack: bool = False,
                 data_simplification: bool = True,
                 is_valid: bool = False,
                 **kwargs):
        self._start_step = start_step
        self._stop_step = stop_step
        self._aicore_metrics = aicore_metrics
        self._profiler_level = profiler_level
        self._profile_framework = profile_framework
        self._analyse_mode = analyse_mode
        self._profile_communication = profile_communication
        self._parallel_strategy = parallel_strategy
        self._with_stack = with_stack
        self._data_simplification = data_simplification
        self._is_valid = is_valid
        self._check_params_type()

    def _check_params_type(self):
        """ check params type."""
        if not isinstance(self._start_step, int):
            logger.warning("start_step should be int type, start_step will be reset to -1.")
            self._start_step = -1

        if not isinstance(self._stop_step, int):
            logger.warning("stop_step should be int type, stop_step will be reset to -1.")
            self._stop_step = -1

        if not isinstance(self._aicore_metrics, int):
            logger.warning("aicore_metrics should be int type, aicore_metrics will be reset to -1.")
            self._aicore_metrics = -1

        if not isinstance(self._profiler_level, int):
            logger.warning("profiler_level should be int type, profiler_level will be reset to -1.")
            self._profiler_level = -1

        if not isinstance(self._profile_framework, int):
            logger.warning("profile_framework should be int type, profile_framework will be reset to -1.")
            self._profile_framework = -1

        if not isinstance(self._analyse_mode, int):
            logger.warning("analyse_mode should be int type, analyse_mode will be reset to -1.")
            self._analyse_mode = -1

        if not isinstance(self._profile_communication, bool):
            logger.warning("profile_communication should be bool type, profile_communication will be reset to False.")
            self._profile_communication = False

        if not isinstance(self._parallel_strategy, bool):
            logger.warning("parallel_strategy should be bool type, parallel_strategy will be reset to False.")
            self._parallel_strategy = False

        if not isinstance(self._with_stack, bool):
            logger.warning("with_stack should be bool type, with_stack will be reset to False.")
            self._with_stack = False

        if not isinstance(self._data_simplification, bool):
            logger.warning("data_simplification should be bool type, data_simplification will be reset to True.")
            self._data_simplification = True

        if not isinstance(self._is_valid, bool):
            logger.warning("is_valid should be bool type, is_valid will be reset to False.")
            self._is_valid = False

    @property
    def start_step(self):
        """ get start step value."""
        return self._start_step

    @property
    def stop_step(self):
        """ get stop step value."""
        return self._stop_step

    @property
    def is_valid(self):
        """ get json valid value."""
        return self._is_valid

    @is_valid.setter
    def is_valid(self, value):
        """ set json valid value."""
        self._is_valid = value

    @property
    def analyse_mode(self):
        """ get analyse mode value."""
        return self._convert_analyse_mode(self._analyse_mode)

    @property
    def vars(self):
        """ get all values in DynamicProfilerArgs."""
        not_supported_args = ['_is_valid']
        res = {}
        for key, value in self.__dict__.items():
            if key not in not_supported_args:
                res[key.replace('_', '', 1)] = value
        return res

    @property
    def args(self):
        """ get all args in DynamicProfilerArgs."""
        self._profiler_level = self._convert_profiler_level(self._profiler_level)
        self._profile_framework = self._convert_profile_framework(self._profile_framework)
        not_supported_args = ['_start_step', '_stop_step', '_analyse_mode', '_is_valid']
        res = {}
        for key, value in self.__dict__.items():
            if key not in not_supported_args:
                res[key.replace('_', '', 1)] = value
        return res

    @classmethod
    def from_bytes(cls, byte_data):
        """ unpack bytes to DynamicProfilerArgs."""
        unpacked = struct.unpack(cls.FMT, byte_data)
        return cls(*unpacked)

    def to_bytes(self):
        """ pack DynamicProfilerArgs to bytes."""
        instance_vars = tuple(self.__dict__.values())
        if len(instance_vars) != len(self.FMT):
            raise ValueError("Number of variables does not match format string.")
        return struct.pack(DynamicProfilerArgs.FMT, *instance_vars)

    def _convert_analyse_mode(self, analyse_mode: int) -> str:
        """ convert analyse_mode to real args in Profiler."""
        if analyse_mode == 0:
            return 'sync'
        if analyse_mode == 1:
            return 'async'
        return None

    def _convert_profiler_level(self, profiler_level: int) -> ProfilerLevel:
        """ convert profiler_level to real args in Profiler."""
        if profiler_level == 0:
            return ProfilerLevel.Level0
        if profiler_level == 1:
            return ProfilerLevel.Level1
        if profiler_level == 2:
            return ProfilerLevel.Level2
        return None

    def _convert_profile_framework(self, profile_framework: int) -> str:
        """ convert profile_framework to real args in Profiler."""
        if profile_framework == 0:
            return "time"
        if profile_framework == 1:
            return "all"
        return None


class DynamicProfilerMonitorBase(Callback):
    """
    Dynamic profile callback base class implementing the dynamic profile functionality.
    """

    def __init__(self, cfg_path, output_path=None, poll_interval=2, **kwargs):
        self._cfg_path = cfg_path
        self._cfg_json_path = os.path.join(self._cfg_path, "profiler_config.json")
        self._cfg_json_path = os.path.realpath(self._cfg_json_path)
        self._output_path = "dyn_profile_data" if output_path is None else output_path
        self._poll_interval = poll_interval
        if not isinstance(self._poll_interval, int):
            logger.error("Poll interval must be an integer, reset to 2.")
            self._poll_interval = 2

        if self._poll_interval < 1:
            logger.error("Poll interval must be greater than 1, reset to 2.")
            self._poll_interval = 2

        self._kwargs = kwargs
        self._shm_name = time.strftime("DynamicProfileShm%Y%m%d%H", time.localtime())
        self._rank_id = get_real_rank()
        self._shared_loop_flag = multiprocessing.Value('b', True)
        self._shm = None
        self._process = None
        self._profiler = None
        self._last_start_step = None
        self._last_stop_step = None
        self._is_create_process = None
        self._is_started = False

        self._init_cfg_json()
        self._create_shm()
        self._create_process()
        atexit.register(self._clean_resource)

    def step_begin(self, run_context):
        """
        Start profiling at the beginning of a step.

        Args:
            run_context (RunContext): Context of the train running.
        """
        prof_args = self._get_prof_args()

        if not prof_args.is_valid:
            logger.error("Dynamic profile json is not valid, please check the json file.")
            return

        if prof_args.start_step == -1 or prof_args.start_step == self._last_start_step:
            return

        cb_params = run_context.original_args()
        step_num = cb_params.cur_step_num
        start_step, stop_step = self._check_step(prof_args.start_step, prof_args.stop_step, step_num)

        # Prevent repeated calls of the start function within a complete interval
        if step_num == start_step:
            if self._is_started:
                logger.error("Dynamic profile is already started at step %d, "
                             "please wait the first profile finished at step %d.",
                             self._last_start_step, self._last_stop_step)
                return

            if self._profiler is None:
                prof_path = os.path.join(self._output_path, f"rank{self._rank_id}_start{start_step}_stop{stop_step}")
                PathManager.check_input_directory_path(prof_path)
                self._profiler = Profiler(output_path=prof_path, start_profile=False, **prof_args.args)
                print_msg(f"Rank {self._rank_id} create output path {prof_path}")

            self._profiler.start()
            self._is_started = True
            self._last_start_step = start_step
            self._last_stop_step = stop_step
            print_msg(f"Rank {self._rank_id} Dynamic profiler start at step {start_step}, "
                      f"will stop at step {stop_step}")

    def step_end(self, run_context):
        """
        Stop profiling at the end of a step.

        Args:
            run_context (RunContext): Context of the train running.
        """
        prof_args = self._get_prof_args()

        if not prof_args.is_valid:
            logger.error("Dynamic profile json is not valid, please check the json file.")
            return

        if prof_args.stop_step == -1:
            return

        cb_params = run_context.original_args()
        step_num = cb_params.cur_step_num

        if step_num == self._last_stop_step and self._is_started:
            if self._profiler:
                self._profiler.stop()
                if prof_args.analyse_mode:
                    self._profiler.analyse(mode=prof_args.analyse_mode)
                else:
                    self._profiler._ascend_profiler.finalize()
                self._profiler = None
            self._is_started = False
            print_msg(f"Rank {self._rank_id} Dynamic profiler stop at step {step_num}")

    def on_train_end(self, run_context):
        """
        Callback on train end.

        Args:
            run_context (RunContext): Context of the train running.
        """
        self._clean_resource()

    def _get_prof_args(self):
        """ Get prof_args """
        logger.error("Dynamic profiler _get_prof_args is not implemented")
        return DynamicProfilerArgs()

    def _clean_resource(self):
        """Clean resource"""
        logger.error("Dynamic profiler _clean_resource is not implemented")

    def _check_step(self, start_step, stop_step, step_num):
        """Check step valid"""
        if start_step <= 0 or stop_step <= 0:
            return -1, -1

        if start_step > stop_step:
            logger.error("start_step must be less than stop_step, "
                         "but get start_step = %d, stop_step = %d", start_step, stop_step)
            return -1, -1

        if start_step < step_num and start_step != self._last_start_step:
            logger.error("start_step must be greater than step_num, "
                         "but get start_step = %d, stop_step = %d, step_num = %d", start_step, stop_step, step_num)
            return -1, -1

        if stop_step < step_num and stop_step != self._last_stop_step:
            logger.error("stop_step must be greater than step_num, "
                         "but get start_step = %d, stop_step = %d, step_num = %d", start_step, stop_step, step_num)
            return -1, -1

        return start_step, stop_step

    def _init_cfg_json(self):
        """Init config json file"""
        if self._rank_id == 0:
            if not os.path.exists(self._cfg_json_path):
                logger.warning("cfg_path is not exist, create default cfg json")
                FileManager.create_json_file(self._cfg_path, DynamicProfilerArgs().vars,
                                             "profiler_config.json", indent=4)
        else:
            logger.info("rank_id is not 0, skip init cfg json")
        print_msg(f"Init config json file: {self._cfg_json_path}")

    def _create_shm(self):
        """Create a json monitor process based on whether the SharedMemory is successfully created"""
        logger.error("Dynamic profiler _create_shm is not implemented")

    def _create_process(self):
        """Create json monitor process, one process will be created at one worker"""
        if self._is_create_process:
            # daemon need to be set to True, otherwise the process will not be killed when the main process exits.
            self._process = multiprocessing.Process(target=worker_func, daemon=True,
                                                    args=(self._shared_loop_flag, self._poll_interval,
                                                          self._shm, self._cfg_json_path))
            self._process.start()
            logger.info("Config monitor process has been created by rank %d.", self._rank_id)
        else:
            self._process = None
            logger.info("Rank %d no need to create process.", self._rank_id)


if sys.version_info >= (3, 8):
    def write_bytes(shm, byte_data):
        """Write bytes to shared memory"""
        shm.buf[:DynamicProfilerArgs.SIZE] = byte_data
else:
    def write_bytes(shm, byte_data):
        """Write bytes to shared memory"""
        shm.seek(0)
        shm.write(byte_data)


def worker_func(loop_flag, poll_interval, shm, cfg_path):
    """ Json monitor process worker function python version >= 3.8"""
    last_file_t = None
    while loop_flag.value:
        if os.path.exists(cfg_path):
            file_t = os.path.getmtime(cfg_path)
            if not last_file_t or last_file_t != file_t:
                last_file_t = file_t

                try:
                    with open(cfg_path, 'r') as f:
                        data = json.load(f)

                    # convert json to DynamicProfilerArgs
                    prof_args = DynamicProfilerArgs(**data)
                    prof_args.is_valid = True
                    logger.info("Dynamic profiler process load json success")
                except json.JSONDecodeError as e:
                    prof_args = DynamicProfilerArgs()
                    prof_args.is_valid = False
                    logger.error("Dynamic profiler process load json failed: %s", e)
                byte_data = prof_args.to_bytes()
                write_bytes(shm, byte_data)
        else:
            logger.error("Dynamic profiler cfg json not exists")
        time.sleep(poll_interval)
    logger.info("Dynamic profiler process done")


if sys.version_info >= (3, 8):
    from multiprocessing import shared_memory
    from unittest.mock import patch

    class DynamicProfilerMonitor(DynamicProfilerMonitorBase):
        r"""
        This class enables dynamic profile monitoring of MindSpore neural networks.

        Args:
            cfg_path (str): Dynamic profile json config file directory. The requirement is a shared path
                that can be accessed by all nodes.
            output_path (str, optional): Output data path. Default: ``"./dyn_profile_data"`` .
            poll_interval (int, optional): The polling period of the monitoring process, in seconds.
                Default value: ``2``.

        Raises:
            RuntimeError: When the number of attempts to create shared memory exceeds the maximum.

        Supported Platforms:
            ``Ascend`` ``GPU``

        Examples:
            >>> import numpy as np
            >>> import mindspore as ms
            >>> from mindspore import nn
            >>> import mindspore.dataset as ds
            >>> from mindspore.profiler import DynamicProfilerMonitor
            >>>
            >>> class Net(nn.Cell):
            ...     def __init__(self):
            ...         super(Net, self).__init__()
            ...         self.fc = nn.Dense(2,2)
            ...     def construct(self, x):
            ...         return self.fc(x)
            >>>
            >>> def generator():
            ...     for i in range(2):
            ...         yield (np.ones([2, 2]).astype(np.float32), np.ones([2]).astype(np.int32))
            >>>
            >>> def train(net):
            ...     optimizer = nn.Momentum(net.trainable_params(), 1, 0.9)
            ...     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
            ...     data = ds.GeneratorDataset(generator, ["data", "label"])
            ...     dynprof_cb = DynamicProfilerMonitor(cfg_path="./dyn_cfg", output_path="./dyn_prof_data")
            ...     model = ms.train.Model(net, loss, optimizer)
            ...     # register DynamicProfilerMonitor to model.train()
            ...     model.train(10, data, callbacks=[dynprof_cb])
        """

        def __init__(self, cfg_path, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
            if not isinstance(cfg_path, str):
                raise TypeError("The cfg_path must be a string.")
            if not isinstance(output_path, str):
                logger.warning(f"The output_path must be a string, "
                               f"but got type {type(output_path)}, it will be set to './dyn_profile_data'.")
                output_path = "./dyn_profile_data"
            super().__init__(cfg_path, output_path, poll_interval, **kwargs)

        def _get_prof_args(self):
            """ Get prof_args py38"""
            return DynamicProfilerArgs.from_bytes(self._shm.buf[:DynamicProfilerArgs.SIZE])

        def _clean_resource(self):
            """Clean resource py38"""
            # stop profiler when stop_step over all train step
            if self._profiler:
                self._profiler.stop()
                self._profiler._ascend_profiler.finalize()
                self._profiler = None
                logger.warning("Rank %d Dynamic profiler stop at end of training", self._rank_id)

            # join process
            if self._process:
                self._shared_loop_flag.value = False
                self._process.join()
                self._process = None
                logger.info("Rank %s process stop", self._rank_id)

            # clear shared memory
            if self._shm:
                try:
                    self._shm.close()
                    self._shm.unlink()
                    logger.info("Rank %s unlink shm", self._rank_id)
                except FileNotFoundError:
                    logger.warning("Rank %s unlink shm failed, may be removed", self._rank_id)
                self._shm = None

        def _create_shm(self):
            """Create a json monitor process based on whether the SharedMemory is successfully created py38"""
            try_times = 10
            while try_times:
                try:
                    # Step 1: try to open shm file, first time shm not exists.
                    # Python incorrectly tracks shared memory even if it is not
                    # created by the process. The following patch is a workaround.
                    with patch("multiprocessing.resource_tracker.register",
                               lambda *args, **kwargs: None):
                        self._shm = shared_memory.SharedMemory(name=self._shm_name)
                    self._is_create_process = False
                    logger.info("Rank %d shared memory is connected.", self._rank_id)
                    break
                except FileNotFoundError:
                    try:
                        # Step 2: only one process can create shm successfully.
                        self._shm = shared_memory.SharedMemory(name=self._shm_name,
                                                               create=True, size=DynamicProfilerArgs.SIZE)
                        self._is_create_process = True
                        logger.info("Rank %d shared memory is created.", self._rank_id)
                        break
                    except FileExistsError:
                        # other process will go to step 1 and open shm file
                        try_times -= 1
                        logger.warning("Rank %d shared memory create failed, "
                                       "retry times = %d.", self._rank_id, try_times)
                        time.sleep(random.uniform(0, 0.02))  # sleep 0 ~ 20 ms

            if try_times <= 0:
                raise RuntimeError(f"Rank {self._rank_id} failed to create shared memory.")

else:
    import mmap
    import stat

    class DynamicProfilerMonitor(DynamicProfilerMonitorBase):
        r"""
        This class enables dynamic profile monitoring of MindSpore neural networks.

        Args:
            cfg_path (str): Dynamic profile json config file directory. The requirement is a shared path
                that can be accessed by all nodes.
            output_path (str, optional): Output data path. Default: ``"./dyn_profile_data"`` .
            poll_interval (int, optional): The polling period of the monitoring process, in seconds.
                Default value: ``2``.

        Raises:
            RuntimeError: When the number of attempts to create shared memory exceeds the maximum.

        Supported Platforms:
            ``Ascend`` ``GPU``

        Examples:
            >>> import numpy as np
            >>> import mindspore as ms
            >>> from mindspore import nn
            >>> import mindspore.dataset as ds
            >>> from mindspore.profiler import DynamicProfilerMonitor
            >>>
            >>> class Net(nn.Cell):
            ...     def __init__(self):
            ...         super(Net, self).__init__()
            ...         self.fc = nn.Dense(2,2)
            ...     def construct(self, x):
            ...         return self.fc(x)
            >>>
            >>> def generator():
            ...     for i in range(2):
            ...         yield (np.ones([2, 2]).astype(np.float32), np.ones([2]).astype(np.int32))
            >>>
            >>> def train(net):
            ...     optimizer = nn.Momentum(net.trainable_params(), 1, 0.9)
            ...     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
            ...     data = ds.GeneratorDataset(generator, ["data", "label"])
            ...     dynprof_cb = DynamicProfilerMonitor(cfg_path="./dyn_cfg", output_path="./dyn_prof_data")
            ...     model = ms.train.Model(net, loss, optimizer)
            ...     # register DynamicProfilerMonitor to model.train()
            ...     model.train(10, data, callbacks=[dynprof_cb])
        """

        def __init__(self, cfg_path, output_path="./dyn_profile_data", poll_interval=2, **kwargs):
            if not isinstance(cfg_path, str):
                raise TypeError("The cfg_path must be a string.")
            if not isinstance(output_path, str):
                logger.warning(f"The output_path must be a string, "
                               f"but got type {type(output_path)}, it will be set to './dyn_profile_data'.")
                output_path = "./dyn_profile_data"
            self._cfg_path = cfg_path
            self._shm_name = time.strftime("DynamicProfileShm%Y%m%d%H", time.localtime())
            self._shm_dir = os.path.join(self._cfg_path, "shm")
            PathManager.make_dir_safety(self._shm_dir)
            self._shm_path = os.path.realpath(os.path.join(self._shm_dir, self._shm_name))

            super().__init__(cfg_path, output_path, poll_interval, **kwargs)
            logger.warning("Dynamic profiler does not work well on Python 3.7, "
                           "please update to Python 3.8+ for better performance.")

        def _get_prof_args(self):
            """ Get prof_args py37"""
            self._shm.seek(0)
            return DynamicProfilerArgs.from_bytes(self._shm.read(DynamicProfilerArgs.SIZE))

        def _clean_resource(self):
            """Clean resource py37"""
            # stop profiler when stop_step over all train step
            if self._profiler:
                self._profiler.stop()
                self._profiler._ascend_profiler.finalize()
                self._profiler = None
                logger.warning("Rank %d Dynamic profiler stop at end of training", self._rank_id)

            # join process
            if self._process:
                self._shared_loop_flag.value = False
                self._process.join()
                self._process = None
                logger.info("Rank %s process stop", self._rank_id)

            # clear shared memory
            if self._shm and self._is_create_process:
                try:
                    self._shm.close()
                    if self._memory_mapped_file and not self._memory_mapped_file.closed:
                        self._memory_mapped_file.close()
                    elif self.fd:
                        os.close(self.fd)
                    PathManager.remove_file_safety(self._shm_path)
                    logger.info("Rank %s unlink shm", self._rank_id)
                except FileNotFoundError:
                    logger.warning("Rank %s unlink shm failed, may be removed", self._rank_id)
                self._shm = None

        def _create_shm(self):
            """Create a json monitor process based on whether the SharedMemory is successfully created py37"""
            try_times = 10
            while try_times:
                try:
                    # Step 1: try to open fd, first time fd not exists.
                    self.fd = os.open(self._shm_path, os.O_EXCL | os.O_RDWR,
                                      stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)
                    self._memory_mapped_file = os.fdopen(self.fd, 'rb')
                    self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=DynamicProfilerArgs.SIZE)
                    self._is_create_process = False
                    logger.info("Rank %d shared memory is connected.", self._rank_id)
                    break
                except ValueError:
                    time.sleep(0.02)
                except FileNotFoundError:
                    try:
                        # Step 2: only one process can create fd successfully.
                        fd = os.open(self._shm_path, os.O_CREAT | os.O_EXCL | os.O_RDWR,
                                     stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)

                        # Init mmap file need to write data
                        with os.fdopen(fd, 'wb') as f:
                            data_instance = DynamicProfilerArgs()
                            byte_data = data_instance.to_bytes()
                            f.write(byte_data)

                        # create mmap
                        self.fd = os.open(self._shm_path, os.O_EXCL | os.O_RDWR,
                                          stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)
                        self._memory_mapped_file = os.fdopen(self.fd, 'rb')
                        self._shm = mmap.mmap(self._memory_mapped_file.fileno(), length=DynamicProfilerArgs.SIZE)
                        self._is_create_process = True
                        logger.info("Rank %d shared memory is created.", self._rank_id)
                        break
                    except FileExistsError:
                        # other process will go to step 1 and open shm file
                        try_times -= 1
                        logger.warning("Rank %d shared memory create failed, "
                                       "retry times = %d.", self._rank_id, try_times)
                        time.sleep(random.uniform(0, 0.02))  # sleep 0 ~ 20 ms

            if try_times <= 0:
                raise RuntimeError("Failed to create shared memory.")
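
For reference, the monitor above is driven entirely by the profiler_config.json file under cfg_path: rank 0 writes a default config at startup, one daemon process per worker polls the file's mtime every poll_interval seconds, and the parsed values are packed with struct (FMT = "iiiiii?????", six ints plus five bools) into shared memory that every rank reads at each step boundary. Below is a minimal sketch of updating that config mid-training; it is not part of the diff, and the ./dyn_cfg path is just the directory used in the docstring example.

import os
import json
import struct

# The ten keys mirror DynamicProfilerArgs.vars; is_valid is managed internally
# by the monitor and is not written to the JSON file.
cfg = {
    "start_step": 10,            # step_begin() arms the profiler at this step
    "stop_step": 12,             # step_end() stops and analyses at this step
    "aicore_metrics": -1,
    "profiler_level": 0,         # 0/1/2 -> ProfilerLevel.Level0/Level1/Level2
    "profile_framework": 0,      # 0 -> "time", 1 -> "all"
    "analyse_mode": 0,           # 0 -> "sync", 1 -> "async", -1 -> finalize only
    "profile_communication": False,
    "parallel_strategy": False,
    "with_stack": False,
    "data_simplification": True,
}
os.makedirs("./dyn_cfg", exist_ok=True)
with open("./dyn_cfg/profiler_config.json", "w") as f:
    json.dump(cfg, f, indent=4)

# The worker process serializes the parsed config with the same format string
# before writing it into shared memory; the round trip is lossless:
fmt = "iiiiii?????"  # six ints followed by five bools (last one is is_valid)
packed = struct.pack(fmt, 10, 12, -1, 0, 0, 0, False, False, False, True, False)
assert struct.unpack(fmt, packed) == (10, 12, -1, 0, 0, 0, False, False, False, True, False)

Because step_begin compares the configured start_step against the last window it acted on, rewriting the file with a new start_step/stop_step pair re-arms the profiler without restarting the job.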
|