mindspore 2.3.0__cp39-cp39-win_amd64.whl → 2.4.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mindspore might be problematic. Click here for more details.

Files changed (285)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +3 -1
  3. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +50 -9
  7. mindspore/_extends/parse/compile_config.py +41 -0
  8. mindspore/_extends/parse/parser.py +9 -7
  9. mindspore/_extends/parse/standard_method.py +52 -14
  10. mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
  11. mindspore/amp.py +24 -10
  12. mindspore/avcodec-59.dll +0 -0
  13. mindspore/avdevice-59.dll +0 -0
  14. mindspore/avfilter-8.dll +0 -0
  15. mindspore/avformat-59.dll +0 -0
  16. mindspore/avutil-57.dll +0 -0
  17. mindspore/common/__init__.py +6 -4
  18. mindspore/common/_pijit_context.py +190 -0
  19. mindspore/common/_register_for_tensor.py +2 -1
  20. mindspore/common/_tensor_overload.py +139 -0
  21. mindspore/common/api.py +102 -87
  22. mindspore/common/dump.py +5 -6
  23. mindspore/common/generator.py +1 -7
  24. mindspore/common/hook_handle.py +14 -26
  25. mindspore/common/mindir_util.py +2 -2
  26. mindspore/common/parameter.py +46 -13
  27. mindspore/common/recompute.py +39 -9
  28. mindspore/common/sparse_tensor.py +7 -3
  29. mindspore/common/tensor.py +209 -29
  30. mindspore/communication/__init__.py +1 -1
  31. mindspore/communication/_comm_helper.py +38 -3
  32. mindspore/communication/comm_func.py +310 -55
  33. mindspore/communication/management.py +14 -14
  34. mindspore/context.py +123 -22
  35. mindspore/dataset/__init__.py +1 -1
  36. mindspore/dataset/audio/__init__.py +1 -1
  37. mindspore/dataset/core/config.py +7 -0
  38. mindspore/dataset/core/validator_helpers.py +7 -0
  39. mindspore/dataset/engine/cache_client.py +1 -1
  40. mindspore/dataset/engine/datasets.py +72 -44
  41. mindspore/dataset/engine/datasets_audio.py +7 -7
  42. mindspore/dataset/engine/datasets_standard_format.py +53 -3
  43. mindspore/dataset/engine/datasets_text.py +20 -20
  44. mindspore/dataset/engine/datasets_user_defined.py +174 -104
  45. mindspore/dataset/engine/datasets_vision.py +33 -33
  46. mindspore/dataset/engine/iterators.py +29 -0
  47. mindspore/dataset/engine/obs/util.py +7 -0
  48. mindspore/dataset/engine/queue.py +114 -60
  49. mindspore/dataset/engine/serializer_deserializer.py +2 -2
  50. mindspore/dataset/engine/validators.py +34 -14
  51. mindspore/dataset/text/__init__.py +1 -4
  52. mindspore/dataset/transforms/__init__.py +0 -3
  53. mindspore/dataset/utils/line_reader.py +2 -0
  54. mindspore/dataset/vision/__init__.py +1 -4
  55. mindspore/dataset/vision/utils.py +1 -1
  56. mindspore/dataset/vision/validators.py +2 -1
  57. mindspore/dnnl.dll +0 -0
  58. mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
  59. mindspore/experimental/es/embedding_service.py +883 -0
  60. mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
  61. mindspore/experimental/llm_boost/__init__.py +21 -0
  62. mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
  63. mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
  64. mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
  65. mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
  66. mindspore/experimental/llm_boost/register.py +129 -0
  67. mindspore/experimental/llm_boost/utils.py +31 -0
  68. mindspore/experimental/optim/adamw.py +85 -0
  69. mindspore/experimental/optim/optimizer.py +3 -0
  70. mindspore/hal/__init__.py +3 -3
  71. mindspore/hal/contiguous_tensors_handle.py +175 -0
  72. mindspore/hal/stream.py +18 -0
  73. mindspore/include/api/model_group.h +13 -1
  74. mindspore/include/api/types.h +10 -10
  75. mindspore/include/dataset/config.h +2 -2
  76. mindspore/include/dataset/constants.h +2 -2
  77. mindspore/include/dataset/execute.h +2 -2
  78. mindspore/include/dataset/vision.h +4 -0
  79. mindspore/jpeg62.dll +0 -0
  80. mindspore/log.py +1 -1
  81. mindspore/mindrecord/filewriter.py +68 -51
  82. mindspore/mindspore_backend.dll +0 -0
  83. mindspore/mindspore_common.dll +0 -0
  84. mindspore/mindspore_core.dll +0 -0
  85. mindspore/mindspore_glog.dll +0 -0
  86. mindspore/mindspore_np_dtype.dll +0 -0
  87. mindspore/mindspore_ops.dll +0 -0
  88. mindspore/mint/__init__.py +495 -46
  89. mindspore/mint/distributed/__init__.py +31 -0
  90. mindspore/mint/distributed/distributed.py +254 -0
  91. mindspore/mint/nn/__init__.py +266 -21
  92. mindspore/mint/nn/functional.py +125 -19
  93. mindspore/mint/nn/layer/__init__.py +39 -0
  94. mindspore/mint/nn/layer/activation.py +133 -0
  95. mindspore/mint/nn/layer/normalization.py +477 -0
  96. mindspore/mint/nn/layer/pooling.py +110 -0
  97. mindspore/mint/optim/adamw.py +28 -7
  98. mindspore/mint/special/__init__.py +63 -0
  99. mindspore/multiprocessing/__init__.py +2 -1
  100. mindspore/nn/__init__.py +0 -1
  101. mindspore/nn/cell.py +275 -93
  102. mindspore/nn/layer/activation.py +211 -44
  103. mindspore/nn/layer/basic.py +113 -3
  104. mindspore/nn/layer/embedding.py +120 -2
  105. mindspore/nn/layer/normalization.py +101 -5
  106. mindspore/nn/layer/padding.py +34 -48
  107. mindspore/nn/layer/pooling.py +161 -7
  108. mindspore/nn/layer/transformer.py +3 -3
  109. mindspore/nn/loss/__init__.py +2 -2
  110. mindspore/nn/loss/loss.py +84 -6
  111. mindspore/nn/optim/__init__.py +2 -1
  112. mindspore/nn/optim/adadelta.py +1 -1
  113. mindspore/nn/optim/adam.py +1 -1
  114. mindspore/nn/optim/lamb.py +1 -1
  115. mindspore/nn/optim/tft_wrapper.py +127 -0
  116. mindspore/nn/wrap/cell_wrapper.py +12 -23
  117. mindspore/nn/wrap/grad_reducer.py +5 -5
  118. mindspore/nn/wrap/loss_scale.py +17 -3
  119. mindspore/numpy/__init__.py +1 -1
  120. mindspore/numpy/array_creations.py +65 -68
  121. mindspore/numpy/array_ops.py +64 -60
  122. mindspore/numpy/fft.py +610 -75
  123. mindspore/numpy/logic_ops.py +11 -10
  124. mindspore/numpy/math_ops.py +85 -84
  125. mindspore/numpy/utils_const.py +4 -4
  126. mindspore/opencv_core452.dll +0 -0
  127. mindspore/opencv_imgcodecs452.dll +0 -0
  128. mindspore/opencv_imgproc452.dll +0 -0
  129. mindspore/ops/__init__.py +6 -4
  130. mindspore/ops/_grad_experimental/grad_comm_ops.py +47 -3
  131. mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
  132. mindspore/ops/_vmap/vmap_array_ops.py +2 -4
  133. mindspore/ops/_vmap/vmap_math_ops.py +17 -1
  134. mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
  135. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +85 -7
  136. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
  137. mindspore/ops/auto_generate/gen_extend_func.py +734 -13
  138. mindspore/ops/auto_generate/gen_ops_def.py +2420 -381
  139. mindspore/ops/auto_generate/gen_ops_prim.py +5196 -1659
  140. mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
  141. mindspore/ops/composite/base.py +85 -48
  142. mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
  143. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
  144. mindspore/ops/function/__init__.py +22 -0
  145. mindspore/ops/function/array_func.py +490 -153
  146. mindspore/ops/function/debug_func.py +113 -1
  147. mindspore/ops/function/fft_func.py +15 -2
  148. mindspore/ops/function/grad/grad_func.py +3 -2
  149. mindspore/ops/function/math_func.py +558 -207
  150. mindspore/ops/function/nn_func.py +817 -383
  151. mindspore/ops/function/other_func.py +3 -2
  152. mindspore/ops/function/random_func.py +184 -8
  153. mindspore/ops/function/reshard_func.py +13 -11
  154. mindspore/ops/function/sparse_unary_func.py +1 -1
  155. mindspore/ops/function/vmap_func.py +3 -2
  156. mindspore/ops/functional.py +24 -14
  157. mindspore/ops/op_info_register.py +3 -3
  158. mindspore/ops/operations/__init__.py +6 -1
  159. mindspore/ops/operations/_grad_ops.py +2 -76
  160. mindspore/ops/operations/_infer_ops.py +1 -1
  161. mindspore/ops/operations/_inner_ops.py +71 -94
  162. mindspore/ops/operations/array_ops.py +12 -146
  163. mindspore/ops/operations/comm_ops.py +42 -53
  164. mindspore/ops/operations/custom_ops.py +83 -19
  165. mindspore/ops/operations/debug_ops.py +42 -10
  166. mindspore/ops/operations/manually_defined/_inner.py +12 -0
  167. mindspore/ops/operations/manually_defined/ops_def.py +265 -10
  168. mindspore/ops/operations/math_ops.py +12 -223
  169. mindspore/ops/operations/nn_ops.py +20 -114
  170. mindspore/ops/operations/other_ops.py +7 -4
  171. mindspore/ops/operations/random_ops.py +46 -1
  172. mindspore/ops/primitive.py +18 -6
  173. mindspore/ops_generate/arg_dtype_cast.py +2 -0
  174. mindspore/ops_generate/gen_aclnn_implement.py +11 -11
  175. mindspore/ops_generate/gen_constants.py +36 -0
  176. mindspore/ops_generate/gen_ops.py +67 -52
  177. mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
  178. mindspore/ops_generate/gen_pyboost_func.py +131 -47
  179. mindspore/ops_generate/op_proto.py +10 -3
  180. mindspore/ops_generate/pyboost_utils.py +14 -1
  181. mindspore/ops_generate/template.py +43 -21
  182. mindspore/parallel/__init__.py +3 -1
  183. mindspore/parallel/_auto_parallel_context.py +28 -8
  184. mindspore/parallel/_cell_wrapper.py +83 -0
  185. mindspore/parallel/_parallel_serialization.py +47 -19
  186. mindspore/parallel/_tensor.py +81 -11
  187. mindspore/parallel/_utils.py +13 -1
  188. mindspore/parallel/algo_parameter_config.py +5 -5
  189. mindspore/parallel/checkpoint_transform.py +46 -39
  190. mindspore/parallel/cluster/process_entity/__init__.py +1 -1
  191. mindspore/parallel/cluster/process_entity/_api.py +31 -23
  192. mindspore/parallel/cluster/process_entity/_utils.py +2 -27
  193. mindspore/parallel/parameter_broadcast.py +3 -4
  194. mindspore/parallel/shard.py +162 -31
  195. mindspore/parallel/transform_safetensors.py +993 -0
  196. mindspore/profiler/__init__.py +2 -1
  197. mindspore/profiler/common/constant.py +29 -0
  198. mindspore/profiler/common/registry.py +47 -0
  199. mindspore/profiler/common/util.py +28 -0
  200. mindspore/profiler/dynamic_profiler.py +694 -0
  201. mindspore/profiler/envprofiling.py +17 -19
  202. mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
  203. mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
  204. mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
  205. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
  206. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
  207. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
  208. mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
  209. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
  210. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
  211. mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
  212. mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
  213. mindspore/profiler/parser/base_timeline_generator.py +19 -25
  214. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
  215. mindspore/profiler/parser/framework_parser.py +1 -391
  216. mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
  217. mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
  218. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
  219. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
  220. mindspore/profiler/parser/memory_usage_parser.py +0 -154
  221. mindspore/profiler/parser/profiler_info.py +78 -6
  222. mindspore/profiler/profiler.py +153 -0
  223. mindspore/profiler/profiling.py +280 -412
  224. mindspore/rewrite/__init__.py +1 -2
  225. mindspore/rewrite/common/namespace.py +4 -4
  226. mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
  227. mindspore/run_check/_check_version.py +36 -103
  228. mindspore/safeguard/rewrite_obfuscation.py +591 -247
  229. mindspore/swresample-4.dll +0 -0
  230. mindspore/swscale-6.dll +0 -0
  231. mindspore/tinyxml2.dll +0 -0
  232. mindspore/train/__init__.py +4 -3
  233. mindspore/train/_utils.py +28 -2
  234. mindspore/train/amp.py +171 -53
  235. mindspore/train/callback/__init__.py +2 -2
  236. mindspore/train/callback/_callback.py +4 -4
  237. mindspore/train/callback/_checkpoint.py +85 -22
  238. mindspore/train/callback/_cluster_monitor.py +1 -1
  239. mindspore/train/callback/_flops_collector.py +1 -0
  240. mindspore/train/callback/_loss_monitor.py +3 -3
  241. mindspore/train/callback/_on_request_exit.py +134 -31
  242. mindspore/train/callback/_summary_collector.py +5 -5
  243. mindspore/train/callback/_tft_register.py +352 -0
  244. mindspore/train/dataset_helper.py +7 -3
  245. mindspore/train/metrics/metric.py +3 -3
  246. mindspore/train/metrics/roc.py +4 -4
  247. mindspore/train/mind_ir_pb2.py +44 -39
  248. mindspore/train/model.py +134 -58
  249. mindspore/train/serialization.py +336 -112
  250. mindspore/turbojpeg.dll +0 -0
  251. mindspore/utils/__init__.py +21 -0
  252. mindspore/utils/utils.py +60 -0
  253. mindspore/version.py +1 -1
  254. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/METADATA +6 -2
  255. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/RECORD +258 -252
  256. mindspore/include/c_api/ms/abstract.h +0 -67
  257. mindspore/include/c_api/ms/attribute.h +0 -197
  258. mindspore/include/c_api/ms/base/handle_types.h +0 -43
  259. mindspore/include/c_api/ms/base/macros.h +0 -32
  260. mindspore/include/c_api/ms/base/status.h +0 -33
  261. mindspore/include/c_api/ms/base/types.h +0 -283
  262. mindspore/include/c_api/ms/context.h +0 -102
  263. mindspore/include/c_api/ms/graph.h +0 -160
  264. mindspore/include/c_api/ms/node.h +0 -606
  265. mindspore/include/c_api/ms/tensor.h +0 -161
  266. mindspore/include/c_api/ms/value.h +0 -84
  267. mindspore/mindspore_shared_lib.dll +0 -0
  268. mindspore/nn/extend/basic.py +0 -140
  269. mindspore/nn/extend/embedding.py +0 -143
  270. mindspore/nn/extend/layer/normalization.py +0 -109
  271. mindspore/nn/extend/pooling.py +0 -117
  272. mindspore/nn/layer/embedding_service.py +0 -531
  273. mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
  274. mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
  275. mindspore/ops/extend/__init__.py +0 -53
  276. mindspore/ops/extend/array_func.py +0 -218
  277. mindspore/ops/extend/math_func.py +0 -76
  278. mindspore/ops/extend/nn_func.py +0 -308
  279. mindspore/ops/silent_check.py +0 -162
  280. mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
  281. mindspore/profiler/parser/msadvisor_parser.py +0 -240
  282. mindspore/train/callback/_mindio_ttp.py +0 -443
  283. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/WHEEL +0 -0
  284. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/entry_points.txt +0 -0
  285. {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/top_level.txt +0 -0
mindspore/profiler/parser/msadvisor_parser.py +0 -240
@@ -1,240 +0,0 @@
1
- # Copyright 2022 Huawei Technologies Co., Ltd
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- # ============================================================================
15
- """
16
- MSAdvisor AICPU model parser.
17
- """
18
-
19
- import os
20
- import stat
21
- import shutil
22
- import json
23
-
24
- from mindspore import log as logger
25
- from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException
26
- from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
27
-
28
-
29
- MIN_TO_US = 60000000 # 1 min to us
30
- MS_TO_US = 1000 # 1 ms to us
31
- AICPU_STREAM_ID = 9000 # aicpu stream id in profiler
32
-
33
-
34
- class MsadvisorParser:
35
- """
36
- Data format conversion for MSAdvisor AICPU model.
37
- """
38
-
39
- def __init__(self, job_id, device_id, rank_id, output_path, pretty=False):
40
- self._job_id = job_id
41
- self._device_id = device_id
42
- self._rank_id = str(rank_id)
43
- self._output_path = output_path
44
- self._aicore_path = ""
45
- self._aicpu_path = ""
46
- self._time_start = 0
47
- self._time_end = 0
48
- self._pretty = pretty
49
-
50
- @property
51
- def indent(self):
52
- indent = 1 if self._pretty else None
53
- return indent
54
-
55
- @staticmethod
56
- def check_clear_make_dir(dir_path):
57
- """
58
- Check if dir exists, then clear the dir and make a new dir.
59
-
60
- Args:
61
- dir_path (str): dir path is needed to clear and make.
62
-
63
- Return:
64
- str, new dir path.
65
- """
66
- dir_path = validate_and_normalize_path(dir_path)
67
- if os.path.exists(dir_path):
68
- shutil.rmtree(dir_path)
69
- os.makedirs(dir_path, stat.S_IRWXU)
70
- return dir_path
71
-
72
- @staticmethod
73
- def generate_aicore_json(aicore_info, tid):
74
- """
75
- Generate dict of operation information which be dumped into json file.
76
-
77
- Args:
78
- aicore_info (str): str read from aicore timeline file.
79
- tid (int): Task Id.
80
-
81
- Return:
82
- dict, dict of operation information which can be dumped into json file.
83
-
84
- Raises:
85
- ValueError: If the value of aicore attrributes cannot be converted to float.
86
- """
87
- op = aicore_info.split(",")
88
- name = op[0]
89
- pid = 0
90
- tid = tid - 1
91
- task_type = "AI_CORE"
92
-
93
- try:
94
- ts, dur, sid = float(op[2]) * MS_TO_US, float(op[3]) * MS_TO_US, float(op[1])
95
- except ValueError as err:
96
- logger.warning("The aicore timeline file content is abnormal. Failed to format aicore timeline file")
97
- raise err
98
- finally:
99
- pass
100
-
101
- op = {
102
- "name": name, "pid": pid, "ts": ts, "dur": dur,
103
- "args": {"Task Type": task_type, "Stream Id": sid, "Task Id": tid}, "ph": "X"
104
- }
105
- return op
106
-
107
- @staticmethod
108
- def generate_aicpu_json(aicpu_info, tid):
109
- """
110
- Generate dict of operation information which be dumped into json file.
111
-
112
- Args:
113
- aicpu_info (str): str read from aicpu timeline file.
114
- tid (int): Task Id.
115
-
116
- Return:
117
- dict, dict of operation information which can be dumped into json file.
118
-
119
- Raises:
120
- ValueError: If the value of aicpu attrributes cannot be converted to float.
121
- """
122
- op = aicpu_info.split(",")
123
- name = op[1]
124
- pid = 1
125
- sid = AICPU_STREAM_ID
126
- tid = tid - 1
127
- task_type = "AI_CPU"
128
-
129
- try:
130
- ts = float(op[5])
131
- dur = float(op[4]) * MS_TO_US
132
- except ValueError as err:
133
- logger.warning("The aicpu timeline file content is abnormal. Failed to format aicpu timeline file")
134
- raise err
135
- finally:
136
- pass
137
-
138
- op = {
139
- "name": name, "pid": pid, "ts": ts, "dur": dur,
140
- "args": {"Task Type": task_type, "Stream Id": sid, "Task Id": tid}, "ph": "X"
141
- }
142
- return op
143
-
144
- def get_input_file(self):
145
- """
146
- Get aicore and aicpu information file from specific path and rank id.
147
-
148
- Raises:
149
- ProfilerFileNotFoundException: If aicore timeline file does not exist.
150
- ProfilerFileNotFoundException: If aicpu timeline file does not exist.
151
- """
152
- self._aicore_path = "output_timeline_data_{}.txt".format(self._rank_id)
153
- self._aicore_path = os.path.join(self._output_path, self._aicore_path)
154
- self._aicore_path = validate_and_normalize_path(self._aicore_path)
155
-
156
- self._aicpu_path = "aicpu_intermediate_{}.csv".format(self._rank_id)
157
- self._aicpu_path = os.path.join(self._output_path, self._aicpu_path)
158
- self._aicpu_path = validate_and_normalize_path(self._aicpu_path)
159
-
160
- if not os.path.exists(self._aicore_path):
161
- logger.warning('The aicore timeline file does not exist!')
162
- raise ProfilerFileNotFoundException(msg=self._aicore_path)
163
- if not os.path.exists(self._aicpu_path):
164
- logger.warning('The aicpu timeline file does not exist!')
165
- raise ProfilerFileNotFoundException(msg=self._aicpu_path)
166
-
167
- def get_output_file(self):
168
- """
169
- Get output path needed by MSAdvisor and created dir.
170
- """
171
- msprof_file = os.path.join(self._output_path, "msadvisor")
172
- msprof_file = os.path.join(msprof_file, "device_" + self._rank_id)
173
- msprof_file = os.path.join(msprof_file, "profiling")
174
- msprof_file = MsadvisorParser.check_clear_make_dir(msprof_file)
175
-
176
- msprof_file = os.path.join(msprof_file, self._job_id)
177
- msprof_file = os.path.join(msprof_file, "device_0", "timeline")
178
- msprof_file = validate_and_normalize_path(msprof_file)
179
- os.makedirs(msprof_file, stat.S_IRWXU)
180
-
181
- msprof_file = os.path.join(msprof_file, "task_time_0_1_1.json")
182
- self._output_path = msprof_file
183
-
184
- def write_aicore(self):
185
- """
186
- Read aicore information from file created by profiler and generate new file needed by MSAdvisor.
187
- """
188
- aicore_file = self._aicore_path
189
- output_file = self._output_path
190
-
191
- with os.fdopen(os.open(output_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
192
- stat.S_IRUSR | stat.S_IWUSR), "w") as output_file:
193
- output_file.write("[")
194
- with os.fdopen(os.open(aicore_file, os.O_RDONLY,
195
- stat.S_IRUSR | stat.S_IWUSR), "r") as aicore_file:
196
- for tid, aicore in enumerate(aicore_file):
197
- if tid == 0:
198
- continue
199
- op = MsadvisorParser.generate_aicore_json(aicore, tid)
200
- if tid == 1:
201
- self._time_start = op.get("ts")
202
- total_duration = op.get("ts") - self._time_start
203
- if total_duration > 1 * MIN_TO_US or tid > 10000:
204
- self._time_end = op.get("ts")
205
- break
206
- if tid > 1:
207
- output_file.write(",")
208
- json.dump(op, output_file, indent=self.indent)
209
-
210
- def write_aicpu(self):
211
- """
212
- Read aicpu information from file created by profiler and write into new file needed by MSAdvisor.
213
- """
214
- aicpu_file = self._aicpu_path
215
- output_file = self._output_path
216
-
217
- with os.fdopen(os.open(output_file, os.O_WRONLY | os.O_APPEND,
218
- stat.S_IRUSR | stat.S_IWUSR), "a") as output_file:
219
- with os.fdopen(os.open(aicpu_file, os.O_RDONLY,
220
- stat.S_IRUSR | stat.S_IWUSR), "r") as aicpu_file:
221
- for tid, aicpu in enumerate(aicpu_file):
222
- if tid == 0:
223
- continue
224
- op = MsadvisorParser.generate_aicpu_json(aicpu, tid)
225
- if op is None:
226
- continue
227
- if op.get("ts") > self._time_end:
228
- break
229
- output_file.write(",")
230
- json.dump(op, output_file, indent=self.indent)
231
- output_file.write("]")
232
-
233
- def parse(self):
234
- """
235
- Interface to call all function in the class. Generated data for AICpu model in MSAdvisor.
236
- """
237
- self.get_input_file()
238
- self.get_output_file()
239
- self.write_aicore()
240
- self.write_aicpu()
mindspore/train/callback/_mindio_ttp.py +0 -443
@@ -1,443 +0,0 @@
1
- # Copyright 2024 Huawei Technologies Co., Ltd
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- # ============================================================================
15
- """Checkpoint related classes and functions."""
16
-
17
- import os
18
- import sys
19
- import copy
20
- from mindspore.train.serialization import save_checkpoint, _convert_cell_param_and_names_to_dict, _get_merged_param_data
21
- from mindspore.parallel._auto_parallel_context import _get_auto_parallel_context
22
- from mindspore.parallel._utils import _get_device_num
23
- from mindspore import _checkparam as Validator
24
- from mindspore.train.callback._callback import Callback
25
- from mindspore.common.tensor import Tensor
26
- from mindspore import context
27
- import mindspore as ms
28
- from mindspore.communication import get_rank
29
- from mindspore.parallel.checkpoint_transform import sync_pipeline_shared_parameters
30
-
31
- from mindspore.train._utils import get_parameter_redundancy
32
- from mindspore import log as logger
33
- from mindspore.parallel._utils import _is_in_auto_parallel_mode
34
- from mindspore.common.api import _get_parameter_layout
35
-
36
-
37
- def _get_dp_from_layout(parameter_layout_dict):
38
- """ Get dp and tp from layout dict. """
39
- pp_num = _get_auto_parallel_context("pipeline_stages")
40
- dev_num = _get_device_num()
41
- global_rank = get_rank()
42
- pipe_size = dev_num // pp_num
43
- initial_rank = (global_rank // pipe_size) * pipe_size
44
- parameter_redundancy_dict = get_parameter_redundancy(
45
- parameter_layout_dict, initial_rank)
46
- value_len = sys.maxsize
47
- min_value = ()
48
- for key, value in parameter_redundancy_dict.items():
49
- if "accu_grads" in key or "inputs" in key:
50
- continue
51
- for item in value:
52
- if len(item) < value_len and global_rank in item:
53
- value_len = len(item)
54
- min_value = item
55
- return min_value
56
-
57
-
58
- def _get_ckpt_dir(append_dict, ckpt_save_path, is_tmp_file):
59
- """ Common func to generate ckpt dir name."""
60
- tmp = "_tmp" if is_tmp_file else ""
61
- mid_dir = f"ttp_saved_checkpoints-{str(append_dict['cur_epoch_num'])}_{str(append_dict['cur_step_num'])}{tmp}"
62
- return os.path.join(ckpt_save_path, mid_dir)
63
-
64
-
65
- def _flush_from_cache(cb_params):
66
- """ Flush cache data to host if tensor is cache enable."""
67
- params = cb_params.train_network.get_parameters()
68
- for param in params:
69
- if param.cache_enable:
70
- Tensor(param).flush_from_cache()
71
-
72
-
73
- def _save_checkpoint_on_failure(save_rank, step, rank_list, save_args):
74
- """ Callback used for TTP save ckpt function when errors occur."""
75
- logger.info("Enter _save_checkpoint_on_failure function")
76
- ckpt_save_path, save_params, append_dict = save_args
77
- ckpt_file = f"iteration-{str(append_dict['cur_epoch_num'])}_{str(append_dict['cur_step_num'])}.ckpt"
78
- cur_ckpt_dir = _get_ckpt_dir(
79
- append_dict, ckpt_save_path, True) + "/rank_" + str(save_rank)
80
- os.makedirs(cur_ckpt_dir)
81
- cur_file = os.path.join(cur_ckpt_dir, ckpt_file)
82
- save_checkpoint(save_params, cur_file,
83
- integrated_save=False, append_dict=append_dict)
84
- logger.info("Finish _save_checkpoint_on_failure function")
85
-
86
-
87
- def _convert_net_to_param_list(save_obj):
88
- """Convert nn.Cell to param_list."""
89
- sync_pipeline_shared_parameters(save_obj)
90
- param_list = []
91
- parameter_layout_dict = save_obj.parameter_layout_dict
92
- if _is_in_auto_parallel_mode() and not parameter_layout_dict:
93
- parameter_layout_dict = _get_parameter_layout()
94
- if not _is_in_auto_parallel_mode():
95
- save_obj.init_parameters_data()
96
- param_dict = _convert_cell_param_and_names_to_dict(save_obj, None)
97
- for (key, value) in param_dict.items():
98
- each_param = {"name": key}
99
- param_data = Tensor(value.asnumpy())
100
- # in automatic model parallel scenario, some parameters were split to all the devices,
101
- # which should be combined before saving
102
- if key in parameter_layout_dict:
103
- param_data = _get_merged_param_data(
104
- save_obj, parameter_layout_dict, key, param_data, False)
105
- each_param["data"] = param_data
106
- param_list.append(each_param)
107
- return param_list
108
-
109
-
110
- def _rename_save_result(rename_args):
111
- """ Callback used for TTP rename function after ckpt save callback was finished and successful."""
112
- logger.info("Enter _rename_save_result function")
113
- ckpt_save_path, _, append_dict = rename_args
114
-
115
- tmp_dir = _get_ckpt_dir(append_dict, ckpt_save_path, True)
116
- fin_dir = _get_ckpt_dir(append_dict, ckpt_save_path, False)
117
-
118
- os.rename(tmp_dir, fin_dir)
119
- logger.info("Finish _rename_save_result function")
120
-
121
-
122
- class MindIOTTPAdapter(Callback):
123
- """
124
- This callback is used to enable the feature
125
- `MindIO TTP <https://www.hiascend.com/document/detail/zh/mindx-dl/60rc1/mindio/mindiottp/mindiottp001.html>`_.
126
- This callback will execute TTP operations during training process, such as TTP init, report and exception handle.
127
-
128
- Note:
129
- Required for Ascend GE LazyInline mode only. And pipline size must be greater than 1.
130
-
131
- Args:
132
- controller_ip (str): TTP controller's ip address, used for init TTP controller.
133
- controller_port (int): TTP controller's ip port, used for init TTP controller and processor.
134
- ckpt_save_path (str): Checkpoint save directory when failure occurs, checkpoint file will save to directory
135
- named ttp_saved_checkpoints-{cur_epoch_num}_{cur_step_num} under this directory.
136
-
137
- Raises:
138
- Exception: TTP init failed.
139
- ModuleNotFoundError: Mindio TTP whl package is not installed.
140
-
141
- Examples:
142
- >>> import numpy as np
143
- >>> import os
144
- >>> import math
145
- >>> import mindspore as ms
146
- >>> import mindspore.dataset as ds
147
- >>> from mindspore import nn, ops, Parameter, train
148
- >>> from mindspore.communication import init
149
- >>> from mindspore.common.initializer import initializer, HeUniform
150
- >>> from mindspore.train import Model, MindIOTTPAdapter
151
- >>> from mindspore import dataset as ds
152
- >>> ms.set_context(mode=ms.GRAPH_MODE, jit_level='O2')
153
- >>> ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, pipeline_stages=2)
154
- >>> init()
155
- >>> ms.set_seed(1)
156
- >>> ms.set_auto_parallel_context(strategy_ckpt_config={"save_file":
157
- >>> "./src_pipeline_strategys/src_strategy_{}.ckpt".format(get_rank())})
158
- >>> class MatMulCell(nn.Cell):
159
- ... def __init__(self, param=None, shape=None):
160
- ... super().__init__()
161
- ... if shape is None:
162
- ... shape = [28 * 28, 512]
163
- ... weight_init = HeUniform(math.sqrt(5))
164
- ... self.param = Parameter(initializer(weight_init, shape), name="param")
165
- ... if param is not None:
166
- ... self.param = param
167
- ... self.print = ops.Print()
168
- ... self.matmul = ops.MatMul()
169
- ...
170
- ... def construct(self, x):
171
- ... out = self.matmul(x, self.param)
172
- ... self.print("out is:", out)
173
- ... return out
174
- >>>
175
- >>> class Network(nn.Cell):
176
- ... def __init__(self):
177
- ... super().__init__()
178
- ... self.flatten = nn.Flatten()
179
- ... self.layer1 = MatMulCell()
180
- ... self.relu1 = nn.ReLU()
181
- ... self.layer2 = nn.Dense(512, 512)
182
- ... self.relu2 = nn.ReLU()
183
- ... self.layer3 = nn.Dense(512, 10)
184
- ...
185
- ... def construct(self, x):
186
- ... x = self.flatten(x)
187
- ... x = self.layer1(x)
188
- ... x = self.relu1(x)
189
- ... x = self.layer2(x)
190
- ... x = self.relu2(x)
191
- ... logits = self.layer3(x)
192
- ... return logits
193
- >>>
194
- >>> net = Network()
195
- >>> net.layer1.pipeline_stage = 0
196
- >>> net.relu1.pipeline_stage = 0
197
- >>> net.layer2.pipeline_stage = 0
198
- >>> net.relu2.pipeline_stage = 1
199
- >>> net.layer3.pipeline_stage = 1
200
- >>>
201
- >>> def create_dataset(batch_size):
202
- ... dataset_path = os.getenv("DATA_PATH")
203
- ... dataset = ds.MnistDataset(dataset_path)
204
- ... image_transforms = [
205
- ... ds.vision.Rescale(1.0 / 255.0, 0),
206
- ... ds.vision.Normalize(mean=(0.1307,), std=(0.3081,)),
207
- ... ds.vision.HWC2CHW()
208
- ... ]
209
- ... label_transform = ds.transforms.TypeCast(ms.int32)
210
- ... dataset = dataset.map(image_transforms, 'image')
211
- ... dataset = dataset.map(label_transform, 'label')
212
- ... dataset = dataset.batch(batch_size)
213
- ... return dataset
214
- >>>
215
- >>> data_set = create_dataset(32)
216
- >>>
217
- >>> optimizer = nn.SGD(net.trainable_params(), 1e-2)
218
- >>> loss_fn = nn.CrossEntropyLoss()
219
- >>>
220
- >>> net_with_loss = nn.PipelineCell(nn.WithLossCell(net, loss_fn), 4)
221
- >>> net_with_loss.set_train()
222
- >>> model = Model(net_with_loss, optimizer=optimizer)
223
- >>> ttp_cb = MindIOTTPAdapter("192.168.0.1", 2000, "./ttp_checkpoint/")
224
- >>> loss_cb = train.LossMonitor(1)
225
- >>> model.train(1, dataset, callbacks=[ttp_cb, loss_cb])
226
- """
227
-
228
def __init__(self, controller_ip, controller_port, ckpt_save_path):
    """
    Build the adapter and decide whether MindIO TTP can be enabled.

    The adapter stays disabled (``self.enable`` is False) unless all of the
    following hold: the device target is Ascend, the execution mode is
    GRAPH_MODE, the environment variable ``MS_ENABLE_MINDIO_GRACEFUL_EXIT``
    equals ``"true"``, and ``MS_MINDIO_TTP_LIB_PATH`` points to an existing file.

    Args:
        controller_ip: address later passed to ``ttp_start_controller`` and
            ``ttp_start_processor``.
        controller_port: port passed alongside ``controller_ip``; checked with
            ``Validator.check_non_negative_int``.
        ckpt_save_path: path later passed to ``ttp_set_ckpt_args``.

    Raises:
        ImportError: if the ``mindio_ttp`` package is not installed (the
            import below is intentionally unguarded).
    """
    super(MindIOTTPAdapter, self).__init__()
    # let it raises errors if not install mindio_ttp package
    from mindio_ttp import framework_ttp as ttp
    self.ttp = ttp
    Validator.check_non_negative_int(controller_port)
    self.has_init = False
    self.enable = False
    # Store the configuration before any early return so the attributes exist
    # on every instance, including adapters that end up disabled.
    self._controller_ip = controller_ip
    self._controller_port = controller_port
    self._ckpt_save_path = ckpt_save_path
    mode = context.get_context("mode")
    if context.get_context("device_target") != "Ascend" or mode != context.GRAPH_MODE:
        logger.warning(
            "MindIO adataper only support on Ascend device with GRAPH Mode.")
        return
    if os.getenv("MS_ENABLE_MINDIO_GRACEFUL_EXIT") != "true":
        logger.warning("MindIO adataper need custom switch on.")
        return
    ttp_lib_path = os.getenv("MS_MINDIO_TTP_LIB_PATH")
    if ttp_lib_path is None or not os.path.isfile(ttp_lib_path):
        logger.warning(
            "MindIO adataper switch on, but ttp library path is not correct.")
        return
    self.enable = True
253
-
254
def wrapper_ttp_persist(self, func):
    """
    Wrap ``func`` with the TTP exception handler when the adapter is enabled.

    Args:
        func (function): train method that needs to be wrapped.

    Returns:
        Function, the result of ``ttp_to_persist(func)`` if TTP is enabled,
        otherwise ``func`` itself, unchanged.
    """
    # Disabled adapter: hand the callable back untouched.
    if not self.enable:
        return func
    return self.ttp.ttp_to_persist(func)
269
-
270
def _init_ttp(self, run_context):
    """
    Initialise MindIO TTP for the current rank (internal use).

    Registers the save-checkpoint and rename handlers, starts the TTP
    controller on rank 0 only, then starts the TTP processor on every rank.

    Args:
        run_context (RunContext): context of the running training; its
            ``original_args().train_network.parameter_layout_dict`` is used
            to derive ``dp``.
    """
    logger.info("Begin to init ttp.")
    world_size = _get_device_num()

    layout = run_context.original_args().train_network.parameter_layout_dict
    dp = _get_dp_from_layout(layout)
    logger.info("Init TTP with dp: {}.".format(dp))

    self.ttp.ttp_register_save_ckpt_handler(_save_checkpoint_on_failure)
    self.ttp.ttp_register_rename_handler(_rename_save_result)

    rank_id = get_rank()
    dp_size = len(dp)
    # An odd-sized dp uses 2 replicas; an even-sized dp uses half its size.
    if dp_size % 2:
        replica = 2
    else:
        replica = dp_size // 2
    local_copy = False

    # Only rank 0 hosts the controller.
    if rank_id == 0:
        logger.info("Begin to start ttp controller.")
        self.ttp.ttp_init_controller(rank_id, world_size, replica, local_copy)
        self.ttp.ttp_start_controller(self._controller_ip, self._controller_port)
        logger.info("Finish start ttp controller.")

    logger.info("Begin to start ttp processor.")
    self.ttp.ttp_init_processor(
        rank_id, dp, dp_size, world_size, replica, local_copy)
    self.ttp.ttp_start_processor(self._controller_ip, self._controller_port)
    logger.info("Finished start ttp processor.")

    logger.info("Finish init ttp.")
304
-
305
def on_train_step_end(self, run_context):
    """
    Report the finished step to MindIO TTP, initialising TTP on first use.

    The adapter permanently disables itself (and returns) when the
    ``pipeline_stages`` parallel context is below 2, or when dataset sink
    mode is on with ``sink_size`` greater than 1.

    Args:
        run_context (RunContext): Context of the train running. Refer to
            :class:`mindspore.train.RunContext` for detail.
    """
    if self.enable is False:
        return
    if _get_auto_parallel_context("pipeline_stages") < 2:
        self.enable = False
        return
    cb_params = run_context.original_args()
    sinks_many_steps = cb_params.dataset_sink_mode is True and cb_params.sink_size > 1
    if sinks_many_steps:
        self.enable = False
        return

    # TTP is initialised only once, at the end of the first eligible step.
    if self.has_init is False:
        self.has_init = True
        self._init_ttp(run_context)

    _flush_from_cache(cb_params)
    rank_id = get_rank()
    # cur_step_num is the global step; fold it into a 1-based in-epoch index.
    step_in_epoch = int((cb_params.cur_step_num - 1) % cb_params.batch_num + 1)
    append_dict = {
        "cur_epoch_num": cb_params.cur_epoch_num,
        "cur_step_num": step_in_epoch,
        "cur_rank": rank_id,
        "batch_num": cb_params.batch_num,
        "global_step": cb_params.cur_step_num,
    }

    # Hand TTP a deep copy of the parameter list rather than the live one.
    params_snapshot = copy.deepcopy(_convert_net_to_param_list(cb_params.train_network))

    logger.info("Set ckpt args to TTP.")
    ckpt_args = (self._ckpt_save_path, params_snapshot, append_dict)
    self.ttp.ttp_set_ckpt_args(ckpt_args)
    logger.info("Set optimizer finish step status to TTP.")
    self.ttp.ttp_end_updating_os(cb_params.cur_step_num)
347
-
348
@staticmethod
def load_checkpoint_with_backup(ckpt_file_path, strategy_file_path, net):
    """
    Load checkpoint into network, and use strategy file to find backup checkpoint file
    when origin checkpoint file not found.

    If loading ``ckpt_file_path`` raises ``ValueError``, the ranks returned by
    ``_get_dp_from_layout(strategy_file_path)`` are tried in order: the
    ``/rank_<current>/`` segment of the path is replaced by ``/rank_<other>/``
    and the first existing file that loads to a non-empty dict is used.

    Note:
        This API must be called after the communication is initialized because the cluster information
        needs to be obtained internally.

    Args:
        ckpt_file_path (str): the checkpoint file to be loaded.
        strategy_file_path (str): strategy file path for current rank.
        net (Cell): network that needs to load checkpoint.

    Returns:
        Dict, checkpoint weights after loaded.

    Raises:
        ValueError: Failed to load the checkpoint file.

    Examples:
        >>> import os
        >>> import numpy as np
        >>> import mindspore as ms
        >>> from mindspore import nn
        >>> from mindspore.communication import init, get_rank, get_group_size
        >>> from mindspore.train import Model, MindIOTTPAdapter
        >>> from mindspore import dataset as ds
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)
        >>> init()
        >>> ms.set_seed(1)
        >>> class Network(nn.Cell):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.flatten = nn.Flatten()
        ...         self.fc = nn.Dense(28*28, 10, weight_init="normal", bias_init="zeros")
        ...         self.relu = nn.ReLU()
        ...
        ...     def construct(self, x):
        ...         x = self.flatten(x)
        ...         logits = self.relu(self.fc(x))
        ...         return logits
        >>>
        >>> net = Network()
        >>>
        >>> def create_dataset(batch_size):
        ...     dataset_path = os.getenv("DATA_PATH")
        ...     rank_id = get_rank()
        ...     rank_size = get_group_size()
        ...     dataset = ds.MnistDataset(dataset_path, num_shards=rank_size, shard_id=rank_id)
        ...     image_transforms = [
        ...         ds.vision.Rescale(1.0 / 255.0, 0),
        ...         ds.vision.Normalize(mean=(0.1307,), std=(0.3081,)),
        ...         ds.vision.HWC2CHW()
        ...     ]
        ...     label_transform = ds.transforms.TypeCast(ms.int32)
        ...     dataset = dataset.map(image_transforms, 'image')
        ...     dataset = dataset.map(label_transform, 'label')
        ...     dataset = dataset.batch(batch_size)
        ...     return dataset
        >>> data_set = create_dataset(32)
        >>> ckpt_file = "./rank_5/iteration-1_40.ckpt"
        >>> strategy_file = "./src_pipeline_strategys/src_strategy_5.ckpt"
        >>> param_dict = MindIOTTPAdapter.load_checkpoint_with_backup(ckpt_file, strategy_file, net)
        >>> data_set.set_init_step(param_dict["global_step"])
    """
    logger.info("Start load checkpoint with strategy file.")
    # Bind the name up front: if the origin load fails and no backup is found,
    # the check below must raise the documented ValueError, not UnboundLocalError.
    param_dict = None
    try:
        param_dict = ms.load_checkpoint(ckpt_file_path)
    except ValueError as e:
        logger.warning(
            "Loading origin checkpoint file failed, the reason is:{}.".format(str(e)))
        dp = _get_dp_from_layout(strategy_file_path)
        rank = get_rank()
        logger.info(
            "Can't load origin checkpoint file, found dp:{}.".format(dp))
        for i in dp:
            if i == rank:
                continue
            new_ckpt = ckpt_file_path.replace(
                f"/rank_{rank}/", f"/rank_{i}/")
            if not os.path.isfile(new_ckpt):
                continue
            try:
                param_dict = ms.load_checkpoint(new_ckpt)
            except ValueError as e1:
                logger.warning(
                    "Loading strategy checkpoint file failed, the reason is:{}.".format(str(e1)))
                param_dict = None
                continue
            # Stop at the first usable backup so a later, broken backup
            # cannot overwrite or discard what was just loaded.
            if param_dict:
                break
    if param_dict:
        logger.info("Found param dict, load it into network.")
        ms.load_param_into_net(net, param_dict)
    else:
        raise ValueError(
            "Load checkpoint file failed, please check your config is set correctly.")
    logger.info("Finish load checkpoint with strategy file.")
    return param_dict