mindspore 2.3.0__cp39-cp39-win_amd64.whl → 2.4.0__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic. Click here for more details.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +3 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +50 -9
- mindspore/_extends/parse/compile_config.py +41 -0
- mindspore/_extends/parse/parser.py +9 -7
- mindspore/_extends/parse/standard_method.py +52 -14
- mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
- mindspore/amp.py +24 -10
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/common/__init__.py +6 -4
- mindspore/common/_pijit_context.py +190 -0
- mindspore/common/_register_for_tensor.py +2 -1
- mindspore/common/_tensor_overload.py +139 -0
- mindspore/common/api.py +102 -87
- mindspore/common/dump.py +5 -6
- mindspore/common/generator.py +1 -7
- mindspore/common/hook_handle.py +14 -26
- mindspore/common/mindir_util.py +2 -2
- mindspore/common/parameter.py +46 -13
- mindspore/common/recompute.py +39 -9
- mindspore/common/sparse_tensor.py +7 -3
- mindspore/common/tensor.py +209 -29
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +38 -3
- mindspore/communication/comm_func.py +310 -55
- mindspore/communication/management.py +14 -14
- mindspore/context.py +123 -22
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/__init__.py +1 -1
- mindspore/dataset/core/config.py +7 -0
- mindspore/dataset/core/validator_helpers.py +7 -0
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +72 -44
- mindspore/dataset/engine/datasets_audio.py +7 -7
- mindspore/dataset/engine/datasets_standard_format.py +53 -3
- mindspore/dataset/engine/datasets_text.py +20 -20
- mindspore/dataset/engine/datasets_user_defined.py +174 -104
- mindspore/dataset/engine/datasets_vision.py +33 -33
- mindspore/dataset/engine/iterators.py +29 -0
- mindspore/dataset/engine/obs/util.py +7 -0
- mindspore/dataset/engine/queue.py +114 -60
- mindspore/dataset/engine/serializer_deserializer.py +2 -2
- mindspore/dataset/engine/validators.py +34 -14
- mindspore/dataset/text/__init__.py +1 -4
- mindspore/dataset/transforms/__init__.py +0 -3
- mindspore/dataset/utils/line_reader.py +2 -0
- mindspore/dataset/vision/__init__.py +1 -4
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/dataset/vision/validators.py +2 -1
- mindspore/dnnl.dll +0 -0
- mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
- mindspore/experimental/es/embedding_service.py +883 -0
- mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
- mindspore/experimental/llm_boost/__init__.py +21 -0
- mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
- mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
- mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
- mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
- mindspore/experimental/llm_boost/register.py +129 -0
- mindspore/experimental/llm_boost/utils.py +31 -0
- mindspore/experimental/optim/adamw.py +85 -0
- mindspore/experimental/optim/optimizer.py +3 -0
- mindspore/hal/__init__.py +3 -3
- mindspore/hal/contiguous_tensors_handle.py +175 -0
- mindspore/hal/stream.py +18 -0
- mindspore/include/api/model_group.h +13 -1
- mindspore/include/api/types.h +10 -10
- mindspore/include/dataset/config.h +2 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/dataset/execute.h +2 -2
- mindspore/include/dataset/vision.h +4 -0
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +1 -1
- mindspore/mindrecord/filewriter.py +68 -51
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +495 -46
- mindspore/mint/distributed/__init__.py +31 -0
- mindspore/mint/distributed/distributed.py +254 -0
- mindspore/mint/nn/__init__.py +266 -21
- mindspore/mint/nn/functional.py +125 -19
- mindspore/mint/nn/layer/__init__.py +39 -0
- mindspore/mint/nn/layer/activation.py +133 -0
- mindspore/mint/nn/layer/normalization.py +477 -0
- mindspore/mint/nn/layer/pooling.py +110 -0
- mindspore/mint/optim/adamw.py +28 -7
- mindspore/mint/special/__init__.py +63 -0
- mindspore/multiprocessing/__init__.py +2 -1
- mindspore/nn/__init__.py +0 -1
- mindspore/nn/cell.py +275 -93
- mindspore/nn/layer/activation.py +211 -44
- mindspore/nn/layer/basic.py +113 -3
- mindspore/nn/layer/embedding.py +120 -2
- mindspore/nn/layer/normalization.py +101 -5
- mindspore/nn/layer/padding.py +34 -48
- mindspore/nn/layer/pooling.py +161 -7
- mindspore/nn/layer/transformer.py +3 -3
- mindspore/nn/loss/__init__.py +2 -2
- mindspore/nn/loss/loss.py +84 -6
- mindspore/nn/optim/__init__.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +127 -0
- mindspore/nn/wrap/cell_wrapper.py +12 -23
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/nn/wrap/loss_scale.py +17 -3
- mindspore/numpy/__init__.py +1 -1
- mindspore/numpy/array_creations.py +65 -68
- mindspore/numpy/array_ops.py +64 -60
- mindspore/numpy/fft.py +610 -75
- mindspore/numpy/logic_ops.py +11 -10
- mindspore/numpy/math_ops.py +85 -84
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -4
- mindspore/ops/_grad_experimental/grad_comm_ops.py +47 -3
- mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
- mindspore/ops/_vmap/vmap_array_ops.py +2 -4
- mindspore/ops/_vmap/vmap_math_ops.py +17 -1
- mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +85 -7
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
- mindspore/ops/auto_generate/gen_extend_func.py +734 -13
- mindspore/ops/auto_generate/gen_ops_def.py +2420 -381
- mindspore/ops/auto_generate/gen_ops_prim.py +5196 -1659
- mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
- mindspore/ops/composite/base.py +85 -48
- mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
- mindspore/ops/function/__init__.py +22 -0
- mindspore/ops/function/array_func.py +490 -153
- mindspore/ops/function/debug_func.py +113 -1
- mindspore/ops/function/fft_func.py +15 -2
- mindspore/ops/function/grad/grad_func.py +3 -2
- mindspore/ops/function/math_func.py +558 -207
- mindspore/ops/function/nn_func.py +817 -383
- mindspore/ops/function/other_func.py +3 -2
- mindspore/ops/function/random_func.py +184 -8
- mindspore/ops/function/reshard_func.py +13 -11
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +3 -2
- mindspore/ops/functional.py +24 -14
- mindspore/ops/op_info_register.py +3 -3
- mindspore/ops/operations/__init__.py +6 -1
- mindspore/ops/operations/_grad_ops.py +2 -76
- mindspore/ops/operations/_infer_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +71 -94
- mindspore/ops/operations/array_ops.py +12 -146
- mindspore/ops/operations/comm_ops.py +42 -53
- mindspore/ops/operations/custom_ops.py +83 -19
- mindspore/ops/operations/debug_ops.py +42 -10
- mindspore/ops/operations/manually_defined/_inner.py +12 -0
- mindspore/ops/operations/manually_defined/ops_def.py +265 -10
- mindspore/ops/operations/math_ops.py +12 -223
- mindspore/ops/operations/nn_ops.py +20 -114
- mindspore/ops/operations/other_ops.py +7 -4
- mindspore/ops/operations/random_ops.py +46 -1
- mindspore/ops/primitive.py +18 -6
- mindspore/ops_generate/arg_dtype_cast.py +2 -0
- mindspore/ops_generate/gen_aclnn_implement.py +11 -11
- mindspore/ops_generate/gen_constants.py +36 -0
- mindspore/ops_generate/gen_ops.py +67 -52
- mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
- mindspore/ops_generate/gen_pyboost_func.py +131 -47
- mindspore/ops_generate/op_proto.py +10 -3
- mindspore/ops_generate/pyboost_utils.py +14 -1
- mindspore/ops_generate/template.py +43 -21
- mindspore/parallel/__init__.py +3 -1
- mindspore/parallel/_auto_parallel_context.py +28 -8
- mindspore/parallel/_cell_wrapper.py +83 -0
- mindspore/parallel/_parallel_serialization.py +47 -19
- mindspore/parallel/_tensor.py +81 -11
- mindspore/parallel/_utils.py +13 -1
- mindspore/parallel/algo_parameter_config.py +5 -5
- mindspore/parallel/checkpoint_transform.py +46 -39
- mindspore/parallel/cluster/process_entity/__init__.py +1 -1
- mindspore/parallel/cluster/process_entity/_api.py +31 -23
- mindspore/parallel/cluster/process_entity/_utils.py +2 -27
- mindspore/parallel/parameter_broadcast.py +3 -4
- mindspore/parallel/shard.py +162 -31
- mindspore/parallel/transform_safetensors.py +993 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/constant.py +29 -0
- mindspore/profiler/common/registry.py +47 -0
- mindspore/profiler/common/util.py +28 -0
- mindspore/profiler/dynamic_profiler.py +694 -0
- mindspore/profiler/envprofiling.py +17 -19
- mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
- mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
- mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
- mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
- mindspore/profiler/parser/base_timeline_generator.py +19 -25
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
- mindspore/profiler/parser/framework_parser.py +1 -391
- mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
- mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
- mindspore/profiler/parser/memory_usage_parser.py +0 -154
- mindspore/profiler/parser/profiler_info.py +78 -6
- mindspore/profiler/profiler.py +153 -0
- mindspore/profiler/profiling.py +280 -412
- mindspore/rewrite/__init__.py +1 -2
- mindspore/rewrite/common/namespace.py +4 -4
- mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
- mindspore/run_check/_check_version.py +36 -103
- mindspore/safeguard/rewrite_obfuscation.py +591 -247
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +4 -3
- mindspore/train/_utils.py +28 -2
- mindspore/train/amp.py +171 -53
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +85 -22
- mindspore/train/callback/_cluster_monitor.py +1 -1
- mindspore/train/callback/_flops_collector.py +1 -0
- mindspore/train/callback/_loss_monitor.py +3 -3
- mindspore/train/callback/_on_request_exit.py +134 -31
- mindspore/train/callback/_summary_collector.py +5 -5
- mindspore/train/callback/_tft_register.py +352 -0
- mindspore/train/dataset_helper.py +7 -3
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/metrics/roc.py +4 -4
- mindspore/train/mind_ir_pb2.py +44 -39
- mindspore/train/model.py +134 -58
- mindspore/train/serialization.py +336 -112
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +21 -0
- mindspore/utils/utils.py +60 -0
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/METADATA +6 -2
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/RECORD +258 -252
- mindspore/include/c_api/ms/abstract.h +0 -67
- mindspore/include/c_api/ms/attribute.h +0 -197
- mindspore/include/c_api/ms/base/handle_types.h +0 -43
- mindspore/include/c_api/ms/base/macros.h +0 -32
- mindspore/include/c_api/ms/base/status.h +0 -33
- mindspore/include/c_api/ms/base/types.h +0 -283
- mindspore/include/c_api/ms/context.h +0 -102
- mindspore/include/c_api/ms/graph.h +0 -160
- mindspore/include/c_api/ms/node.h +0 -606
- mindspore/include/c_api/ms/tensor.h +0 -161
- mindspore/include/c_api/ms/value.h +0 -84
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/nn/extend/basic.py +0 -140
- mindspore/nn/extend/embedding.py +0 -143
- mindspore/nn/extend/layer/normalization.py +0 -109
- mindspore/nn/extend/pooling.py +0 -117
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
- mindspore/ops/extend/__init__.py +0 -53
- mindspore/ops/extend/array_func.py +0 -218
- mindspore/ops/extend/math_func.py +0 -76
- mindspore/ops/extend/nn_func.py +0 -308
- mindspore/ops/silent_check.py +0 -162
- mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
- mindspore/profiler/parser/msadvisor_parser.py +0 -240
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/top_level.txt +0 -0
|
@@ -1,240 +0,0 @@
|
|
|
1
|
-
# Copyright 2022 Huawei Technologies Co., Ltd
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
# ============================================================================
|
|
15
|
-
"""
|
|
16
|
-
MSAdvisor AICPU model parser.
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
import os
|
|
20
|
-
import stat
|
|
21
|
-
import shutil
|
|
22
|
-
import json
|
|
23
|
-
|
|
24
|
-
from mindspore import log as logger
|
|
25
|
-
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException
|
|
26
|
-
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
MIN_TO_US = 60000000 # 1 min to us
|
|
30
|
-
MS_TO_US = 1000 # 1 ms to us
|
|
31
|
-
AICPU_STREAM_ID = 9000 # aicpu stream id in profiler
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
class MsadvisorParser:
|
|
35
|
-
"""
|
|
36
|
-
Data format conversion for MSAdvisor AICPU model.
|
|
37
|
-
"""
|
|
38
|
-
|
|
39
|
-
def __init__(self, job_id, device_id, rank_id, output_path, pretty=False):
|
|
40
|
-
self._job_id = job_id
|
|
41
|
-
self._device_id = device_id
|
|
42
|
-
self._rank_id = str(rank_id)
|
|
43
|
-
self._output_path = output_path
|
|
44
|
-
self._aicore_path = ""
|
|
45
|
-
self._aicpu_path = ""
|
|
46
|
-
self._time_start = 0
|
|
47
|
-
self._time_end = 0
|
|
48
|
-
self._pretty = pretty
|
|
49
|
-
|
|
50
|
-
@property
|
|
51
|
-
def indent(self):
|
|
52
|
-
indent = 1 if self._pretty else None
|
|
53
|
-
return indent
|
|
54
|
-
|
|
55
|
-
@staticmethod
|
|
56
|
-
def check_clear_make_dir(dir_path):
|
|
57
|
-
"""
|
|
58
|
-
Check if dir exists, then clear the dir and make a new dir.
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
dir_path (str): dir path is needed to clear and make.
|
|
62
|
-
|
|
63
|
-
Return:
|
|
64
|
-
str, new dir path.
|
|
65
|
-
"""
|
|
66
|
-
dir_path = validate_and_normalize_path(dir_path)
|
|
67
|
-
if os.path.exists(dir_path):
|
|
68
|
-
shutil.rmtree(dir_path)
|
|
69
|
-
os.makedirs(dir_path, stat.S_IRWXU)
|
|
70
|
-
return dir_path
|
|
71
|
-
|
|
72
|
-
@staticmethod
|
|
73
|
-
def generate_aicore_json(aicore_info, tid):
|
|
74
|
-
"""
|
|
75
|
-
Generate dict of operation information which be dumped into json file.
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
aicore_info (str): str read from aicore timeline file.
|
|
79
|
-
tid (int): Task Id.
|
|
80
|
-
|
|
81
|
-
Return:
|
|
82
|
-
dict, dict of operation information which can be dumped into json file.
|
|
83
|
-
|
|
84
|
-
Raises:
|
|
85
|
-
ValueError: If the value of aicore attrributes cannot be converted to float.
|
|
86
|
-
"""
|
|
87
|
-
op = aicore_info.split(",")
|
|
88
|
-
name = op[0]
|
|
89
|
-
pid = 0
|
|
90
|
-
tid = tid - 1
|
|
91
|
-
task_type = "AI_CORE"
|
|
92
|
-
|
|
93
|
-
try:
|
|
94
|
-
ts, dur, sid = float(op[2]) * MS_TO_US, float(op[3]) * MS_TO_US, float(op[1])
|
|
95
|
-
except ValueError as err:
|
|
96
|
-
logger.warning("The aicore timeline file content is abnormal. Failed to format aicore timeline file")
|
|
97
|
-
raise err
|
|
98
|
-
finally:
|
|
99
|
-
pass
|
|
100
|
-
|
|
101
|
-
op = {
|
|
102
|
-
"name": name, "pid": pid, "ts": ts, "dur": dur,
|
|
103
|
-
"args": {"Task Type": task_type, "Stream Id": sid, "Task Id": tid}, "ph": "X"
|
|
104
|
-
}
|
|
105
|
-
return op
|
|
106
|
-
|
|
107
|
-
@staticmethod
|
|
108
|
-
def generate_aicpu_json(aicpu_info, tid):
|
|
109
|
-
"""
|
|
110
|
-
Generate dict of operation information which be dumped into json file.
|
|
111
|
-
|
|
112
|
-
Args:
|
|
113
|
-
aicpu_info (str): str read from aicpu timeline file.
|
|
114
|
-
tid (int): Task Id.
|
|
115
|
-
|
|
116
|
-
Return:
|
|
117
|
-
dict, dict of operation information which can be dumped into json file.
|
|
118
|
-
|
|
119
|
-
Raises:
|
|
120
|
-
ValueError: If the value of aicpu attrributes cannot be converted to float.
|
|
121
|
-
"""
|
|
122
|
-
op = aicpu_info.split(",")
|
|
123
|
-
name = op[1]
|
|
124
|
-
pid = 1
|
|
125
|
-
sid = AICPU_STREAM_ID
|
|
126
|
-
tid = tid - 1
|
|
127
|
-
task_type = "AI_CPU"
|
|
128
|
-
|
|
129
|
-
try:
|
|
130
|
-
ts = float(op[5])
|
|
131
|
-
dur = float(op[4]) * MS_TO_US
|
|
132
|
-
except ValueError as err:
|
|
133
|
-
logger.warning("The aicpu timeline file content is abnormal. Failed to format aicpu timeline file")
|
|
134
|
-
raise err
|
|
135
|
-
finally:
|
|
136
|
-
pass
|
|
137
|
-
|
|
138
|
-
op = {
|
|
139
|
-
"name": name, "pid": pid, "ts": ts, "dur": dur,
|
|
140
|
-
"args": {"Task Type": task_type, "Stream Id": sid, "Task Id": tid}, "ph": "X"
|
|
141
|
-
}
|
|
142
|
-
return op
|
|
143
|
-
|
|
144
|
-
def get_input_file(self):
|
|
145
|
-
"""
|
|
146
|
-
Get aicore and aicpu information file from specific path and rank id.
|
|
147
|
-
|
|
148
|
-
Raises:
|
|
149
|
-
ProfilerFileNotFoundException: If aicore timeline file does not exist.
|
|
150
|
-
ProfilerFileNotFoundException: If aicpu timeline file does not exist.
|
|
151
|
-
"""
|
|
152
|
-
self._aicore_path = "output_timeline_data_{}.txt".format(self._rank_id)
|
|
153
|
-
self._aicore_path = os.path.join(self._output_path, self._aicore_path)
|
|
154
|
-
self._aicore_path = validate_and_normalize_path(self._aicore_path)
|
|
155
|
-
|
|
156
|
-
self._aicpu_path = "aicpu_intermediate_{}.csv".format(self._rank_id)
|
|
157
|
-
self._aicpu_path = os.path.join(self._output_path, self._aicpu_path)
|
|
158
|
-
self._aicpu_path = validate_and_normalize_path(self._aicpu_path)
|
|
159
|
-
|
|
160
|
-
if not os.path.exists(self._aicore_path):
|
|
161
|
-
logger.warning('The aicore timeline file does not exist!')
|
|
162
|
-
raise ProfilerFileNotFoundException(msg=self._aicore_path)
|
|
163
|
-
if not os.path.exists(self._aicpu_path):
|
|
164
|
-
logger.warning('The aicpu timeline file does not exist!')
|
|
165
|
-
raise ProfilerFileNotFoundException(msg=self._aicpu_path)
|
|
166
|
-
|
|
167
|
-
def get_output_file(self):
|
|
168
|
-
"""
|
|
169
|
-
Get output path needed by MSAdvisor and created dir.
|
|
170
|
-
"""
|
|
171
|
-
msprof_file = os.path.join(self._output_path, "msadvisor")
|
|
172
|
-
msprof_file = os.path.join(msprof_file, "device_" + self._rank_id)
|
|
173
|
-
msprof_file = os.path.join(msprof_file, "profiling")
|
|
174
|
-
msprof_file = MsadvisorParser.check_clear_make_dir(msprof_file)
|
|
175
|
-
|
|
176
|
-
msprof_file = os.path.join(msprof_file, self._job_id)
|
|
177
|
-
msprof_file = os.path.join(msprof_file, "device_0", "timeline")
|
|
178
|
-
msprof_file = validate_and_normalize_path(msprof_file)
|
|
179
|
-
os.makedirs(msprof_file, stat.S_IRWXU)
|
|
180
|
-
|
|
181
|
-
msprof_file = os.path.join(msprof_file, "task_time_0_1_1.json")
|
|
182
|
-
self._output_path = msprof_file
|
|
183
|
-
|
|
184
|
-
def write_aicore(self):
|
|
185
|
-
"""
|
|
186
|
-
Read aicore information from file created by profiler and generate new file needed by MSAdvisor.
|
|
187
|
-
"""
|
|
188
|
-
aicore_file = self._aicore_path
|
|
189
|
-
output_file = self._output_path
|
|
190
|
-
|
|
191
|
-
with os.fdopen(os.open(output_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
|
|
192
|
-
stat.S_IRUSR | stat.S_IWUSR), "w") as output_file:
|
|
193
|
-
output_file.write("[")
|
|
194
|
-
with os.fdopen(os.open(aicore_file, os.O_RDONLY,
|
|
195
|
-
stat.S_IRUSR | stat.S_IWUSR), "r") as aicore_file:
|
|
196
|
-
for tid, aicore in enumerate(aicore_file):
|
|
197
|
-
if tid == 0:
|
|
198
|
-
continue
|
|
199
|
-
op = MsadvisorParser.generate_aicore_json(aicore, tid)
|
|
200
|
-
if tid == 1:
|
|
201
|
-
self._time_start = op.get("ts")
|
|
202
|
-
total_duration = op.get("ts") - self._time_start
|
|
203
|
-
if total_duration > 1 * MIN_TO_US or tid > 10000:
|
|
204
|
-
self._time_end = op.get("ts")
|
|
205
|
-
break
|
|
206
|
-
if tid > 1:
|
|
207
|
-
output_file.write(",")
|
|
208
|
-
json.dump(op, output_file, indent=self.indent)
|
|
209
|
-
|
|
210
|
-
def write_aicpu(self):
|
|
211
|
-
"""
|
|
212
|
-
Read aicpu information from file created by profiler and write into new file needed by MSAdvisor.
|
|
213
|
-
"""
|
|
214
|
-
aicpu_file = self._aicpu_path
|
|
215
|
-
output_file = self._output_path
|
|
216
|
-
|
|
217
|
-
with os.fdopen(os.open(output_file, os.O_WRONLY | os.O_APPEND,
|
|
218
|
-
stat.S_IRUSR | stat.S_IWUSR), "a") as output_file:
|
|
219
|
-
with os.fdopen(os.open(aicpu_file, os.O_RDONLY,
|
|
220
|
-
stat.S_IRUSR | stat.S_IWUSR), "r") as aicpu_file:
|
|
221
|
-
for tid, aicpu in enumerate(aicpu_file):
|
|
222
|
-
if tid == 0:
|
|
223
|
-
continue
|
|
224
|
-
op = MsadvisorParser.generate_aicpu_json(aicpu, tid)
|
|
225
|
-
if op is None:
|
|
226
|
-
continue
|
|
227
|
-
if op.get("ts") > self._time_end:
|
|
228
|
-
break
|
|
229
|
-
output_file.write(",")
|
|
230
|
-
json.dump(op, output_file, indent=self.indent)
|
|
231
|
-
output_file.write("]")
|
|
232
|
-
|
|
233
|
-
def parse(self):
|
|
234
|
-
"""
|
|
235
|
-
Interface to call all function in the class. Generated data for AICpu model in MSAdvisor.
|
|
236
|
-
"""
|
|
237
|
-
self.get_input_file()
|
|
238
|
-
self.get_output_file()
|
|
239
|
-
self.write_aicore()
|
|
240
|
-
self.write_aicpu()
|
|
@@ -1,443 +0,0 @@
|
|
|
1
|
-
# Copyright 2024 Huawei Technologies Co., Ltd
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
# ============================================================================
|
|
15
|
-
"""Checkpoint related classes and functions."""
|
|
16
|
-
|
|
17
|
-
import os
|
|
18
|
-
import sys
|
|
19
|
-
import copy
|
|
20
|
-
from mindspore.train.serialization import save_checkpoint, _convert_cell_param_and_names_to_dict, _get_merged_param_data
|
|
21
|
-
from mindspore.parallel._auto_parallel_context import _get_auto_parallel_context
|
|
22
|
-
from mindspore.parallel._utils import _get_device_num
|
|
23
|
-
from mindspore import _checkparam as Validator
|
|
24
|
-
from mindspore.train.callback._callback import Callback
|
|
25
|
-
from mindspore.common.tensor import Tensor
|
|
26
|
-
from mindspore import context
|
|
27
|
-
import mindspore as ms
|
|
28
|
-
from mindspore.communication import get_rank
|
|
29
|
-
from mindspore.parallel.checkpoint_transform import sync_pipeline_shared_parameters
|
|
30
|
-
|
|
31
|
-
from mindspore.train._utils import get_parameter_redundancy
|
|
32
|
-
from mindspore import log as logger
|
|
33
|
-
from mindspore.parallel._utils import _is_in_auto_parallel_mode
|
|
34
|
-
from mindspore.common.api import _get_parameter_layout
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def _get_dp_from_layout(parameter_layout_dict):
|
|
38
|
-
""" Get dp and tp from layout dict. """
|
|
39
|
-
pp_num = _get_auto_parallel_context("pipeline_stages")
|
|
40
|
-
dev_num = _get_device_num()
|
|
41
|
-
global_rank = get_rank()
|
|
42
|
-
pipe_size = dev_num // pp_num
|
|
43
|
-
initial_rank = (global_rank // pipe_size) * pipe_size
|
|
44
|
-
parameter_redundancy_dict = get_parameter_redundancy(
|
|
45
|
-
parameter_layout_dict, initial_rank)
|
|
46
|
-
value_len = sys.maxsize
|
|
47
|
-
min_value = ()
|
|
48
|
-
for key, value in parameter_redundancy_dict.items():
|
|
49
|
-
if "accu_grads" in key or "inputs" in key:
|
|
50
|
-
continue
|
|
51
|
-
for item in value:
|
|
52
|
-
if len(item) < value_len and global_rank in item:
|
|
53
|
-
value_len = len(item)
|
|
54
|
-
min_value = item
|
|
55
|
-
return min_value
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def _get_ckpt_dir(append_dict, ckpt_save_path, is_tmp_file):
|
|
59
|
-
""" Common func to generate ckpt dir name."""
|
|
60
|
-
tmp = "_tmp" if is_tmp_file else ""
|
|
61
|
-
mid_dir = f"ttp_saved_checkpoints-{str(append_dict['cur_epoch_num'])}_{str(append_dict['cur_step_num'])}{tmp}"
|
|
62
|
-
return os.path.join(ckpt_save_path, mid_dir)
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def _flush_from_cache(cb_params):
|
|
66
|
-
""" Flush cache data to host if tensor is cache enable."""
|
|
67
|
-
params = cb_params.train_network.get_parameters()
|
|
68
|
-
for param in params:
|
|
69
|
-
if param.cache_enable:
|
|
70
|
-
Tensor(param).flush_from_cache()
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def _save_checkpoint_on_failure(save_rank, step, rank_list, save_args):
|
|
74
|
-
""" Callback used for TTP save ckpt function when errors occur."""
|
|
75
|
-
logger.info("Enter _save_checkpoint_on_failure function")
|
|
76
|
-
ckpt_save_path, save_params, append_dict = save_args
|
|
77
|
-
ckpt_file = f"iteration-{str(append_dict['cur_epoch_num'])}_{str(append_dict['cur_step_num'])}.ckpt"
|
|
78
|
-
cur_ckpt_dir = _get_ckpt_dir(
|
|
79
|
-
append_dict, ckpt_save_path, True) + "/rank_" + str(save_rank)
|
|
80
|
-
os.makedirs(cur_ckpt_dir)
|
|
81
|
-
cur_file = os.path.join(cur_ckpt_dir, ckpt_file)
|
|
82
|
-
save_checkpoint(save_params, cur_file,
|
|
83
|
-
integrated_save=False, append_dict=append_dict)
|
|
84
|
-
logger.info("Finish _save_checkpoint_on_failure function")
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def _convert_net_to_param_list(save_obj):
|
|
88
|
-
"""Convert nn.Cell to param_list."""
|
|
89
|
-
sync_pipeline_shared_parameters(save_obj)
|
|
90
|
-
param_list = []
|
|
91
|
-
parameter_layout_dict = save_obj.parameter_layout_dict
|
|
92
|
-
if _is_in_auto_parallel_mode() and not parameter_layout_dict:
|
|
93
|
-
parameter_layout_dict = _get_parameter_layout()
|
|
94
|
-
if not _is_in_auto_parallel_mode():
|
|
95
|
-
save_obj.init_parameters_data()
|
|
96
|
-
param_dict = _convert_cell_param_and_names_to_dict(save_obj, None)
|
|
97
|
-
for (key, value) in param_dict.items():
|
|
98
|
-
each_param = {"name": key}
|
|
99
|
-
param_data = Tensor(value.asnumpy())
|
|
100
|
-
# in automatic model parallel scenario, some parameters were split to all the devices,
|
|
101
|
-
# which should be combined before saving
|
|
102
|
-
if key in parameter_layout_dict:
|
|
103
|
-
param_data = _get_merged_param_data(
|
|
104
|
-
save_obj, parameter_layout_dict, key, param_data, False)
|
|
105
|
-
each_param["data"] = param_data
|
|
106
|
-
param_list.append(each_param)
|
|
107
|
-
return param_list
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def _rename_save_result(rename_args):
|
|
111
|
-
""" Callback used for TTP rename function after ckpt save callback was finished and successful."""
|
|
112
|
-
logger.info("Enter _rename_save_result function")
|
|
113
|
-
ckpt_save_path, _, append_dict = rename_args
|
|
114
|
-
|
|
115
|
-
tmp_dir = _get_ckpt_dir(append_dict, ckpt_save_path, True)
|
|
116
|
-
fin_dir = _get_ckpt_dir(append_dict, ckpt_save_path, False)
|
|
117
|
-
|
|
118
|
-
os.rename(tmp_dir, fin_dir)
|
|
119
|
-
logger.info("Finish _rename_save_result function")
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
class MindIOTTPAdapter(Callback):
    """
    This callback is used to enable the feature
    `MindIO TTP <https://www.hiascend.com/document/detail/zh/mindx-dl/60rc1/mindio/mindiottp/mindiottp001.html>`_.
    This callback will execute TTP operations during training process, such as TTP init, report and exception handle.

    Note:
        Required for Ascend GE LazyInline mode only. And pipeline size must be greater than 1.

    Args:
        controller_ip (str): TTP controller's ip address, used for init TTP controller.
        controller_port (int): TTP controller's ip port, used for init TTP controller and processor.
        ckpt_save_path (str): Checkpoint save directory when failure occurs, checkpoint file will save to directory
            named ttp_saved_checkpoints-{cur_epoch_num}_{cur_step_num} under this directory.

    Raises:
        Exception: TTP init failed.
        ModuleNotFoundError: Mindio TTP whl package is not installed.

    Examples:
        >>> import numpy as np
        >>> import os
        >>> import math
        >>> import mindspore as ms
        >>> import mindspore.dataset as ds
        >>> from mindspore import nn, ops, Parameter, train
        >>> from mindspore.communication import init, get_rank
        >>> from mindspore.common.initializer import initializer, HeUniform
        >>> from mindspore.train import Model, MindIOTTPAdapter
        >>> ms.set_context(mode=ms.GRAPH_MODE, jit_level='O2')
        >>> ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, pipeline_stages=2)
        >>> init()
        >>> ms.set_seed(1)
        >>> ms.set_auto_parallel_context(strategy_ckpt_config={"save_file":
        ...                              "./src_pipeline_strategys/src_strategy_{}.ckpt".format(get_rank())})
        >>> class MatMulCell(nn.Cell):
        ...     def __init__(self, param=None, shape=None):
        ...         super().__init__()
        ...         if shape is None:
        ...             shape = [28 * 28, 512]
        ...         weight_init = HeUniform(math.sqrt(5))
        ...         self.param = Parameter(initializer(weight_init, shape), name="param")
        ...         if param is not None:
        ...             self.param = param
        ...         self.print = ops.Print()
        ...         self.matmul = ops.MatMul()
        ...
        ...     def construct(self, x):
        ...         out = self.matmul(x, self.param)
        ...         self.print("out is:", out)
        ...         return out
        >>>
        >>> class Network(nn.Cell):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.flatten = nn.Flatten()
        ...         self.layer1 = MatMulCell()
        ...         self.relu1 = nn.ReLU()
        ...         self.layer2 = nn.Dense(512, 512)
        ...         self.relu2 = nn.ReLU()
        ...         self.layer3 = nn.Dense(512, 10)
        ...
        ...     def construct(self, x):
        ...         x = self.flatten(x)
        ...         x = self.layer1(x)
        ...         x = self.relu1(x)
        ...         x = self.layer2(x)
        ...         x = self.relu2(x)
        ...         logits = self.layer3(x)
        ...         return logits
        >>>
        >>> net = Network()
        >>> net.layer1.pipeline_stage = 0
        >>> net.relu1.pipeline_stage = 0
        >>> net.layer2.pipeline_stage = 0
        >>> net.relu2.pipeline_stage = 1
        >>> net.layer3.pipeline_stage = 1
        >>>
        >>> def create_dataset(batch_size):
        ...     dataset_path = os.getenv("DATA_PATH")
        ...     dataset = ds.MnistDataset(dataset_path)
        ...     image_transforms = [
        ...         ds.vision.Rescale(1.0 / 255.0, 0),
        ...         ds.vision.Normalize(mean=(0.1307,), std=(0.3081,)),
        ...         ds.vision.HWC2CHW()
        ...     ]
        ...     label_transform = ds.transforms.TypeCast(ms.int32)
        ...     dataset = dataset.map(image_transforms, 'image')
        ...     dataset = dataset.map(label_transform, 'label')
        ...     dataset = dataset.batch(batch_size)
        ...     return dataset
        >>>
        >>> data_set = create_dataset(32)
        >>>
        >>> optimizer = nn.SGD(net.trainable_params(), 1e-2)
        >>> loss_fn = nn.CrossEntropyLoss()
        >>>
        >>> net_with_loss = nn.PipelineCell(nn.WithLossCell(net, loss_fn), 4)
        >>> net_with_loss.set_train()
        >>> model = Model(net_with_loss, optimizer=optimizer)
        >>> ttp_cb = MindIOTTPAdapter("192.168.0.1", 2000, "./ttp_checkpoint/")
        >>> loss_cb = train.LossMonitor(1)
        >>> model.train(1, data_set, callbacks=[ttp_cb, loss_cb])
    """

    def __init__(self, controller_ip, controller_port, ckpt_save_path):
        super(MindIOTTPAdapter, self).__init__()
        # let it raises errors if not install mindio_ttp package
        from mindio_ttp import framework_ttp as ttp
        self.ttp = ttp
        Validator.check_non_negative_int(controller_port)
        self.has_init = False
        # The adapter stays disabled unless every precondition below is satisfied.
        self.enable = False
        mode = context.get_context("mode")
        if context.get_context("device_target") != "Ascend" or mode != context.GRAPH_MODE:
            logger.warning(
                "MindIO adataper only support on Ascend device with GRAPH Mode.")
            return
        if os.getenv("MS_ENABLE_MINDIO_GRACEFUL_EXIT") != "true":
            logger.warning("MindIO adataper need custom switch on.")
            return
        ttp_lib_path = os.getenv("MS_MINDIO_TTP_LIB_PATH")
        if ttp_lib_path is None or not os.path.isfile(ttp_lib_path):
            logger.warning(
                "MindIO adataper switch on, but ttp library path is not correct.")
            return
        self.enable = True
        self._controller_ip = controller_ip
        self._controller_port = controller_port
        self._ckpt_save_path = ckpt_save_path

    def wrapper_ttp_persist(self, func):
        """
        This method is used to wrapper TTP exception handler for the input func.

        Args:
            func (function): train method that need to be wrapper.

        Returns:
            Function, if the TTP is enabled, return the encapsulated function,
            otherwise the original function is returned.

        """
        if self.enable:
            return self.ttp.ttp_to_persist(func)
        return func

    def _init_ttp(self, run_context):
        """
        Init Mindio TTP, used internal.

        Registers the save and rename handlers, starts the TTP controller on rank 0
        and starts the TTP processor on the current rank.

        Args:
            run_context (RunContext): Context of the train running; its ``train_network``
                supplies the parameter layout used to derive the ``dp`` group.
        """
        logger.info("Begin to init ttp.")
        dev_num = _get_device_num()

        cb_params = run_context.original_args()
        param_layout_dict = cb_params.train_network.parameter_layout_dict
        dp = _get_dp_from_layout(param_layout_dict)
        logger.info("Init TTP with dp: {}.".format(dp))

        self.ttp.ttp_register_save_ckpt_handler(_save_checkpoint_on_failure)
        self.ttp.ttp_register_rename_handler(_rename_save_result)

        world_size = dev_num
        cur_rank = get_rank()
        is_odd = len(dp) % 2
        replica = 2 if is_odd else len(dp) // 2
        enable_local_copy = False
        # Only rank 0 hosts the controller; the processor below is started on the current rank.
        if cur_rank == 0:
            logger.info("Begin to start ttp controller.")
            self.ttp.ttp_init_controller(
                cur_rank, world_size, replica, enable_local_copy)
            self.ttp.ttp_start_controller(
                self._controller_ip, self._controller_port)
            logger.info("Finish start ttp controller.")

        logger.info("Begin to start ttp processor.")
        self.ttp.ttp_init_processor(
            cur_rank, dp, len(dp), world_size, replica, enable_local_copy)
        self.ttp.ttp_start_processor(
            self._controller_ip, self._controller_port)
        logger.info("Finished start ttp processor.")

        logger.info("Finish init ttp.")

    def on_train_step_end(self, run_context):
        """
        Init TTP Controller only once after first step finished.
        And report status to MindIO TTP after every step finished.

        The adapter disables itself (and does nothing further) when ``pipeline_stages`` is
        smaller than 2, or when dataset sink mode is on with ``sink_size`` greater than 1.

        Args:
            run_context (RunContext): Context of the train running. Refer to
                :class:`mindspore.train.RunContext` for detail.

        """
        if not self.enable:
            return
        pp_num = _get_auto_parallel_context("pipeline_stages")
        if pp_num < 2:
            self.enable = False
            return
        cb_params = run_context.original_args()
        if cb_params.dataset_sink_mode is True and cb_params.sink_size > 1:
            self.enable = False
            return
        if not self.has_init:
            # Flip the flag before initializing so the init is attempted only once.
            self.has_init = True
            self._init_ttp(run_context)
        _flush_from_cache(cb_params)
        cur_rank = get_rank()
        append_dict = {
            "cur_epoch_num": cb_params.cur_epoch_num,
            # 1-based step index within the current epoch.
            "cur_step_num": int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1),
            "cur_rank": cur_rank,
            "batch_num": cb_params.batch_num,
            "global_step": cb_params.cur_step_num,
        }

        save_params = _convert_net_to_param_list(cb_params.train_network)
        # TTP receives its own copy of the parameter records.
        save_params_copy = copy.deepcopy(save_params)

        logger.info("Set ckpt args to TTP.")
        self.ttp.ttp_set_ckpt_args(
            (self._ckpt_save_path, save_params_copy, append_dict))
        logger.info("Set optimizer finish step status to TTP.")
        self.ttp.ttp_end_updating_os(cb_params.cur_step_num)

    @staticmethod
    def load_checkpoint_with_backup(ckpt_file_path, strategy_file_path, net):
        """
        Load checkpoint into network, and use strategy file to find backup checkpoint file
        when origin checkpoint file not found.

        When the origin checkpoint cannot be loaded, the checkpoint paths of the other ranks
        in the same ``dp`` group are tried in order (by replacing ``/rank_{rank}/`` in
        `ckpt_file_path`), and the first one that loads successfully is used.

        Note:
            This API must be called after the communication is initialized because the cluster information
            needs to be obtained internally.

        Args:
            ckpt_file_path (str): the checkpoint file to be loaded.
            strategy_file_path (str): strategy file path for current rank.
            net (Cell): network that needs to load checkpoint.

        Returns:
            Dict, checkpoint weights after loaded.

        Raises:
            ValueError: Failed to load the checkpoint file.

        Examples:
            >>> import os
            >>> import numpy as np
            >>> import mindspore as ms
            >>> from mindspore import nn
            >>> from mindspore.communication import init, get_rank, get_group_size
            >>> from mindspore.train import Model, MindIOTTPAdapter
            >>> from mindspore import dataset as ds
            >>> ms.set_context(mode=ms.GRAPH_MODE)
            >>> ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)
            >>> init()
            >>> ms.set_seed(1)
            >>> class Network(nn.Cell):
            ...     def __init__(self):
            ...         super().__init__()
            ...         self.flatten = nn.Flatten()
            ...         self.fc = nn.Dense(28*28, 10, weight_init="normal", bias_init="zeros")
            ...         self.relu = nn.ReLU()
            ...
            ...     def construct(self, x):
            ...         x = self.flatten(x)
            ...         logits = self.relu(self.fc(x))
            ...         return logits
            >>>
            >>> net = Network()
            >>>
            >>> def create_dataset(batch_size):
            ...     dataset_path = os.getenv("DATA_PATH")
            ...     rank_id = get_rank()
            ...     rank_size = get_group_size()
            ...     dataset = ds.MnistDataset(dataset_path, num_shards=rank_size, shard_id=rank_id)
            ...     image_transforms = [
            ...         ds.vision.Rescale(1.0 / 255.0, 0),
            ...         ds.vision.Normalize(mean=(0.1307,), std=(0.3081,)),
            ...         ds.vision.HWC2CHW()
            ...     ]
            ...     label_transform = ds.transforms.TypeCast(ms.int32)
            ...     dataset = dataset.map(image_transforms, 'image')
            ...     dataset = dataset.map(label_transform, 'label')
            ...     dataset = dataset.batch(batch_size)
            ...     return dataset
            >>> data_set = create_dataset(32)
            >>> ckpt_file = "./rank_5/iteration-1_40.ckpt"
            >>> strategy_file = "./src_pipeline_strategys/src_strategy_5.ckpt"
            >>> param_dict = MindIOTTPAdapter.load_checkpoint_with_backup(ckpt_file, strategy_file, net)
            >>> data_set.set_init_step(param_dict["global_step"])
        """
        logger.info("Start load checkpoint with strategy file.")
        # Bind the name up front so that a failed origin load followed by "no usable backup"
        # reaches the ValueError below instead of an UnboundLocalError.
        param_dict = None
        try:
            param_dict = ms.load_checkpoint(ckpt_file_path)
        except ValueError as e:
            logger.warning(
                "Loading origin checkpoint file failed, the reason is:{}.".format(str(e)))
            dp = _get_dp_from_layout(strategy_file_path)
            rank = get_rank()
            logger.info(
                "Can't load origin checkpoint file, found dp:{}.".format(dp))
            for i in dp:
                if i == rank:
                    continue
                new_ckpt = ckpt_file_path.replace(
                    f"/rank_{rank}/", f"/rank_{str(i)}/")
                if not os.path.isfile(new_ckpt):
                    continue
                try:
                    param_dict = ms.load_checkpoint(new_ckpt)
                except ValueError as e1:
                    logger.warning(
                        "Loading strategy checkpoint file failed, the reason is:{}.".format(str(e1)))
                    param_dict = None
                    continue
                if param_dict:
                    # Stop at the first usable backup so that a later replica can neither
                    # overwrite it nor reset it to None by failing to load.
                    break
        if param_dict:
            logger.info("Found param dict, load it into network.")
            ms.load_param_into_net(net, param_dict)
        else:
            raise ValueError(
                "Load checkpoint file failed, please check your config is set correctly.")
        logger.info("Finish load checkpoint with strategy file.")
        return param_dict
|
|
File without changes
|
|
File without changes
|
|
File without changes
|