mindspore-2.3.0-cp39-cp39-win_amd64.whl → mindspore-2.4.1-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore has been flagged as potentially problematic.

Files changed (287)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +3 -1
  3. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +50 -9
  7. mindspore/_extends/parse/compile_config.py +41 -0
  8. mindspore/_extends/parse/parser.py +9 -7
  9. mindspore/_extends/parse/standard_method.py +52 -14
  10. mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
  11. mindspore/amp.py +24 -10
  12. mindspore/avcodec-59.dll +0 -0
  13. mindspore/avdevice-59.dll +0 -0
  14. mindspore/avfilter-8.dll +0 -0
  15. mindspore/avformat-59.dll +0 -0
  16. mindspore/avutil-57.dll +0 -0
  17. mindspore/common/__init__.py +6 -4
  18. mindspore/common/_pijit_context.py +190 -0
  19. mindspore/common/_register_for_tensor.py +2 -1
  20. mindspore/common/_tensor_overload.py +139 -0
  21. mindspore/common/api.py +102 -87
  22. mindspore/common/dump.py +5 -6
  23. mindspore/common/generator.py +1 -7
  24. mindspore/common/hook_handle.py +14 -26
  25. mindspore/common/initializer.py +51 -15
  26. mindspore/common/mindir_util.py +2 -2
  27. mindspore/common/parameter.py +62 -15
  28. mindspore/common/recompute.py +39 -9
  29. mindspore/common/sparse_tensor.py +7 -3
  30. mindspore/common/tensor.py +183 -37
  31. mindspore/communication/__init__.py +1 -1
  32. mindspore/communication/_comm_helper.py +38 -3
  33. mindspore/communication/comm_func.py +315 -60
  34. mindspore/communication/management.py +14 -14
  35. mindspore/context.py +132 -22
  36. mindspore/dataset/__init__.py +1 -1
  37. mindspore/dataset/audio/__init__.py +1 -1
  38. mindspore/dataset/core/config.py +7 -0
  39. mindspore/dataset/core/validator_helpers.py +7 -0
  40. mindspore/dataset/engine/cache_client.py +1 -1
  41. mindspore/dataset/engine/datasets.py +72 -44
  42. mindspore/dataset/engine/datasets_audio.py +7 -7
  43. mindspore/dataset/engine/datasets_standard_format.py +53 -3
  44. mindspore/dataset/engine/datasets_text.py +20 -20
  45. mindspore/dataset/engine/datasets_user_defined.py +174 -104
  46. mindspore/dataset/engine/datasets_vision.py +33 -33
  47. mindspore/dataset/engine/iterators.py +29 -0
  48. mindspore/dataset/engine/obs/util.py +7 -0
  49. mindspore/dataset/engine/queue.py +114 -60
  50. mindspore/dataset/engine/serializer_deserializer.py +2 -2
  51. mindspore/dataset/engine/validators.py +34 -14
  52. mindspore/dataset/text/__init__.py +1 -4
  53. mindspore/dataset/transforms/__init__.py +0 -3
  54. mindspore/dataset/utils/line_reader.py +2 -0
  55. mindspore/dataset/vision/__init__.py +1 -4
  56. mindspore/dataset/vision/utils.py +1 -1
  57. mindspore/dataset/vision/validators.py +2 -1
  58. mindspore/dnnl.dll +0 -0
  59. mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
  60. mindspore/experimental/es/embedding_service.py +883 -0
  61. mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
  62. mindspore/experimental/llm_boost/__init__.py +21 -0
  63. mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
  64. mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
  65. mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
  66. mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
  67. mindspore/experimental/llm_boost/register.py +129 -0
  68. mindspore/experimental/llm_boost/utils.py +31 -0
  69. mindspore/experimental/optim/adamw.py +85 -0
  70. mindspore/experimental/optim/optimizer.py +3 -0
  71. mindspore/hal/__init__.py +3 -3
  72. mindspore/hal/contiguous_tensors_handle.py +175 -0
  73. mindspore/hal/stream.py +18 -0
  74. mindspore/include/api/model_group.h +13 -1
  75. mindspore/include/api/types.h +10 -10
  76. mindspore/include/dataset/config.h +2 -2
  77. mindspore/include/dataset/constants.h +2 -2
  78. mindspore/include/dataset/execute.h +2 -2
  79. mindspore/include/dataset/vision.h +4 -0
  80. mindspore/jpeg62.dll +0 -0
  81. mindspore/log.py +1 -1
  82. mindspore/mindrecord/filewriter.py +68 -51
  83. mindspore/mindspore_backend.dll +0 -0
  84. mindspore/mindspore_common.dll +0 -0
  85. mindspore/mindspore_core.dll +0 -0
  86. mindspore/mindspore_glog.dll +0 -0
  87. mindspore/mindspore_np_dtype.dll +0 -0
  88. mindspore/mindspore_ops.dll +0 -0
  89. mindspore/mint/__init__.py +983 -46
  90. mindspore/mint/distributed/__init__.py +31 -0
  91. mindspore/mint/distributed/distributed.py +254 -0
  92. mindspore/mint/nn/__init__.py +268 -23
  93. mindspore/mint/nn/functional.py +125 -19
  94. mindspore/mint/nn/layer/__init__.py +39 -0
  95. mindspore/mint/nn/layer/activation.py +133 -0
  96. mindspore/mint/nn/layer/normalization.py +477 -0
  97. mindspore/mint/nn/layer/pooling.py +110 -0
  98. mindspore/mint/optim/adamw.py +26 -13
  99. mindspore/mint/special/__init__.py +63 -0
  100. mindspore/multiprocessing/__init__.py +2 -1
  101. mindspore/nn/__init__.py +0 -1
  102. mindspore/nn/cell.py +276 -96
  103. mindspore/nn/layer/activation.py +211 -44
  104. mindspore/nn/layer/basic.py +137 -10
  105. mindspore/nn/layer/embedding.py +137 -2
  106. mindspore/nn/layer/normalization.py +101 -5
  107. mindspore/nn/layer/padding.py +34 -48
  108. mindspore/nn/layer/pooling.py +161 -7
  109. mindspore/nn/layer/transformer.py +3 -3
  110. mindspore/nn/loss/__init__.py +2 -2
  111. mindspore/nn/loss/loss.py +84 -6
  112. mindspore/nn/optim/__init__.py +2 -1
  113. mindspore/nn/optim/adadelta.py +1 -1
  114. mindspore/nn/optim/adam.py +1 -1
  115. mindspore/nn/optim/lamb.py +1 -1
  116. mindspore/nn/optim/tft_wrapper.py +124 -0
  117. mindspore/nn/wrap/cell_wrapper.py +12 -23
  118. mindspore/nn/wrap/grad_reducer.py +5 -5
  119. mindspore/nn/wrap/loss_scale.py +17 -3
  120. mindspore/numpy/__init__.py +1 -1
  121. mindspore/numpy/array_creations.py +65 -68
  122. mindspore/numpy/array_ops.py +64 -60
  123. mindspore/numpy/fft.py +610 -75
  124. mindspore/numpy/logic_ops.py +11 -10
  125. mindspore/numpy/math_ops.py +85 -84
  126. mindspore/numpy/utils_const.py +4 -4
  127. mindspore/opencv_core452.dll +0 -0
  128. mindspore/opencv_imgcodecs452.dll +0 -0
  129. mindspore/opencv_imgproc452.dll +0 -0
  130. mindspore/ops/__init__.py +6 -4
  131. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
  132. mindspore/ops/_grad_experimental/grad_comm_ops.py +67 -4
  133. mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
  134. mindspore/ops/_vmap/vmap_array_ops.py +2 -4
  135. mindspore/ops/_vmap/vmap_math_ops.py +17 -1
  136. mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
  137. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +91 -7
  138. mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
  139. mindspore/ops/auto_generate/gen_extend_func.py +767 -13
  140. mindspore/ops/auto_generate/gen_ops_def.py +2452 -364
  141. mindspore/ops/auto_generate/gen_ops_prim.py +5442 -1756
  142. mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
  143. mindspore/ops/composite/base.py +85 -48
  144. mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
  145. mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
  146. mindspore/ops/function/__init__.py +22 -0
  147. mindspore/ops/function/array_func.py +492 -153
  148. mindspore/ops/function/debug_func.py +113 -1
  149. mindspore/ops/function/fft_func.py +15 -2
  150. mindspore/ops/function/grad/grad_func.py +3 -2
  151. mindspore/ops/function/math_func.py +564 -207
  152. mindspore/ops/function/nn_func.py +817 -383
  153. mindspore/ops/function/other_func.py +3 -2
  154. mindspore/ops/function/random_func.py +402 -12
  155. mindspore/ops/function/reshard_func.py +13 -11
  156. mindspore/ops/function/sparse_unary_func.py +1 -1
  157. mindspore/ops/function/vmap_func.py +3 -2
  158. mindspore/ops/functional.py +24 -14
  159. mindspore/ops/op_info_register.py +3 -3
  160. mindspore/ops/operations/__init__.py +7 -2
  161. mindspore/ops/operations/_grad_ops.py +2 -76
  162. mindspore/ops/operations/_infer_ops.py +1 -1
  163. mindspore/ops/operations/_inner_ops.py +71 -94
  164. mindspore/ops/operations/array_ops.py +14 -146
  165. mindspore/ops/operations/comm_ops.py +63 -53
  166. mindspore/ops/operations/custom_ops.py +83 -19
  167. mindspore/ops/operations/debug_ops.py +42 -10
  168. mindspore/ops/operations/manually_defined/_inner.py +12 -0
  169. mindspore/ops/operations/manually_defined/ops_def.py +273 -20
  170. mindspore/ops/operations/math_ops.py +12 -223
  171. mindspore/ops/operations/nn_ops.py +20 -114
  172. mindspore/ops/operations/other_ops.py +7 -4
  173. mindspore/ops/operations/random_ops.py +46 -1
  174. mindspore/ops/primitive.py +18 -6
  175. mindspore/ops_generate/arg_dtype_cast.py +2 -0
  176. mindspore/ops_generate/gen_aclnn_implement.py +11 -11
  177. mindspore/ops_generate/gen_constants.py +36 -0
  178. mindspore/ops_generate/gen_ops.py +67 -52
  179. mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
  180. mindspore/ops_generate/gen_pyboost_func.py +131 -47
  181. mindspore/ops_generate/op_proto.py +10 -3
  182. mindspore/ops_generate/pyboost_utils.py +14 -1
  183. mindspore/ops_generate/template.py +43 -21
  184. mindspore/parallel/__init__.py +3 -1
  185. mindspore/parallel/_auto_parallel_context.py +31 -9
  186. mindspore/parallel/_cell_wrapper.py +85 -0
  187. mindspore/parallel/_parallel_serialization.py +47 -19
  188. mindspore/parallel/_tensor.py +127 -13
  189. mindspore/parallel/_utils.py +53 -22
  190. mindspore/parallel/algo_parameter_config.py +5 -5
  191. mindspore/parallel/checkpoint_transform.py +46 -39
  192. mindspore/parallel/cluster/process_entity/__init__.py +1 -1
  193. mindspore/parallel/cluster/process_entity/_api.py +31 -23
  194. mindspore/parallel/cluster/process_entity/_utils.py +2 -27
  195. mindspore/parallel/parameter_broadcast.py +3 -4
  196. mindspore/parallel/shard.py +162 -31
  197. mindspore/parallel/transform_safetensors.py +1146 -0
  198. mindspore/profiler/__init__.py +2 -1
  199. mindspore/profiler/common/constant.py +29 -0
  200. mindspore/profiler/common/registry.py +47 -0
  201. mindspore/profiler/common/util.py +28 -0
  202. mindspore/profiler/dynamic_profiler.py +694 -0
  203. mindspore/profiler/envprofiling.py +17 -19
  204. mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
  205. mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
  206. mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
  207. mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
  208. mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
  209. mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
  210. mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
  211. mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
  212. mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
  213. mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
  214. mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
  215. mindspore/profiler/parser/base_timeline_generator.py +19 -25
  216. mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
  217. mindspore/profiler/parser/framework_parser.py +1 -391
  218. mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
  219. mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
  220. mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
  221. mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
  222. mindspore/profiler/parser/memory_usage_parser.py +0 -154
  223. mindspore/profiler/parser/profiler_info.py +78 -6
  224. mindspore/profiler/profiler.py +153 -0
  225. mindspore/profiler/profiling.py +285 -413
  226. mindspore/rewrite/__init__.py +1 -2
  227. mindspore/rewrite/common/namespace.py +4 -4
  228. mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
  229. mindspore/run_check/_check_version.py +39 -104
  230. mindspore/safeguard/rewrite_obfuscation.py +591 -247
  231. mindspore/swresample-4.dll +0 -0
  232. mindspore/swscale-6.dll +0 -0
  233. mindspore/tinyxml2.dll +0 -0
  234. mindspore/train/__init__.py +4 -3
  235. mindspore/train/_utils.py +105 -19
  236. mindspore/train/amp.py +171 -53
  237. mindspore/train/callback/__init__.py +2 -2
  238. mindspore/train/callback/_callback.py +4 -4
  239. mindspore/train/callback/_checkpoint.py +97 -31
  240. mindspore/train/callback/_cluster_monitor.py +1 -1
  241. mindspore/train/callback/_flops_collector.py +1 -0
  242. mindspore/train/callback/_loss_monitor.py +3 -3
  243. mindspore/train/callback/_on_request_exit.py +145 -31
  244. mindspore/train/callback/_summary_collector.py +5 -5
  245. mindspore/train/callback/_tft_register.py +375 -0
  246. mindspore/train/dataset_helper.py +15 -3
  247. mindspore/train/metrics/metric.py +3 -3
  248. mindspore/train/metrics/roc.py +4 -4
  249. mindspore/train/mind_ir_pb2.py +44 -39
  250. mindspore/train/model.py +154 -58
  251. mindspore/train/serialization.py +342 -128
  252. mindspore/turbojpeg.dll +0 -0
  253. mindspore/utils/__init__.py +21 -0
  254. mindspore/utils/utils.py +60 -0
  255. mindspore/version.py +1 -1
  256. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +13 -7
  257. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +260 -254
  258. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +1 -1
  259. mindspore/include/c_api/ms/abstract.h +0 -67
  260. mindspore/include/c_api/ms/attribute.h +0 -197
  261. mindspore/include/c_api/ms/base/handle_types.h +0 -43
  262. mindspore/include/c_api/ms/base/macros.h +0 -32
  263. mindspore/include/c_api/ms/base/status.h +0 -33
  264. mindspore/include/c_api/ms/base/types.h +0 -283
  265. mindspore/include/c_api/ms/context.h +0 -102
  266. mindspore/include/c_api/ms/graph.h +0 -160
  267. mindspore/include/c_api/ms/node.h +0 -606
  268. mindspore/include/c_api/ms/tensor.h +0 -161
  269. mindspore/include/c_api/ms/value.h +0 -84
  270. mindspore/mindspore_shared_lib.dll +0 -0
  271. mindspore/nn/extend/basic.py +0 -140
  272. mindspore/nn/extend/embedding.py +0 -143
  273. mindspore/nn/extend/layer/normalization.py +0 -109
  274. mindspore/nn/extend/pooling.py +0 -117
  275. mindspore/nn/layer/embedding_service.py +0 -531
  276. mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
  277. mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
  278. mindspore/ops/extend/__init__.py +0 -53
  279. mindspore/ops/extend/array_func.py +0 -218
  280. mindspore/ops/extend/math_func.py +0 -76
  281. mindspore/ops/extend/nn_func.py +0 -308
  282. mindspore/ops/silent_check.py +0 -162
  283. mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
  284. mindspore/profiler/parser/msadvisor_parser.py +0 -240
  285. mindspore/train/callback/_mindio_ttp.py +0 -443
  286. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
  287. {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
mindspore/parallel/_utils.py

@@ -13,6 +13,8 @@
 # limitations under the License.
 # ============================================================================
 """Utils of auto parallel"""
+import os
+from time import perf_counter
 from importlib import import_module
 import numpy as np
 import mindspore as ms
@@ -22,12 +24,13 @@ from mindspore.common.tensor import Tensor
 from mindspore.common.dtype import dtype_to_nptype
 from mindspore.common import dtype as mstype
 from mindspore.communication.management import get_group_size, get_rank
+from mindspore.communication._comm_helper import _is_initialized
 from mindspore.parallel._auto_parallel_context import auto_parallel_context
 from mindspore.common.seed import get_seed
 from mindspore._c_expression import GraphExecutor_
-from mindspore.parallel._tensor import _load_tensor_by_layout
+from mindspore.parallel._tensor import _load_tensor_by_layout, _load_tensor_shape_by_layout

-SUPPORTED_TUPLE_IN_TUPLE_STRATEGY = ["GroupedMatmul", "FusedInferAttentionScore"]
+SUPPORTED_TUPLE_IN_TUPLE_STRATEGY = ["GroupedMatmul", "FusedInferAttentionScore", "Custom"]


 def _get_parallel_mode():
@@ -45,6 +48,16 @@ def _is_in_auto_parallel_mode():
     return _get_parallel_mode() in [ms.ParallelMode.SEMI_AUTO_PARALLEL, ms.ParallelMode.AUTO_PARALLEL]


+def _is_parallel_mode():
+    if not _is_initialized() or context.get_context('mode') == context.PYNATIVE_MODE:
+        return False
+    if os.getenv("RUN_MODE") != "predict":
+        return False
+    if get_group_size() > 1 and _get_parallel_mode() == ms.ParallelMode.STAND_ALONE:
+        return True
+    return False
+
+
 def _is_in_data_parallel_mode():
     return _get_parallel_mode() == ms.ParallelMode.DATA_PARALLEL

@@ -92,31 +105,49 @@ def _need_to_full():
     return not _get_full_batch()


+class ParallelParamInitProfCtx:
+    """Collect parallel param initialization performance context mgr."""
+
+    def __init__(self, parameter, func_name):
+        self.parameter = parameter
+        self.func_name = func_name
+        self.start_timestamp = None
+
+    def __enter__(self):
+        self.start_timestamp = perf_counter()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        end_timestamp = perf_counter()
+        duration = end_timestamp - self.start_timestamp
+        if os.getenv("MS_DEV_PARAM_INIT_PROF_COLLECT"):
+            logger.warning(f"{self.func_name}: {self.parameter.name}, shape: {self.parameter.shape}, "
+                           f"sliced: {self.parameter.sliced}, duration: {duration}")
+
+
 def _slice_parameter(parameter, phase, layout):
     """Slice python parameter obj according to the layout."""
-    is_train_phase = phase.startswith('train')
-    is_prefill_phase = phase.startswith('prefill')
-    if layout is not None and parameter.from_ckpt and not is_train_phase:
-        is_opt_shard_group = layout[5]
-        if not parameter.sliced and is_prefill_phase and is_opt_shard_group:
+    # graph_executor.updata_param_node_default_input(phase, {parameter.name: parameter})
+    if getattr(parameter, "init_param", False):
+        if layout is None:
+            parameter.sliced = True
+            return
+        if not parameter.sliced:
+            rank = get_rank()
+            new_tensor_shape = _load_tensor_shape_by_layout(parameter, layout, rank)
+            parameter.shape = new_tensor_shape
+    else:
+        graph_executor = GraphExecutor_.get_instance()
+        new_param = parameter.init_data(layout, set_sliced=True)
+        parameter = new_param
+        graph_executor.updata_param_node_default_input(phase, {parameter.name: parameter})
+        if layout is None:
+            parameter.sliced = True
+            return
+        if not parameter.sliced:
             rank = get_rank()
             new_tensor = _load_tensor_by_layout(parameter, layout, rank)
             parameter.set_data(new_tensor, True)
-            return
-        layout_shape = layout[2]
-        parameter.shape = tuple(layout_shape)
-        return
-    graph_executor = GraphExecutor_.get_instance()
-    new_param = parameter.init_data(layout, set_sliced=True)
-    parameter = new_param
-    graph_executor.updata_param_node_default_input(phase, {parameter.name: parameter})
-    if layout is None:
-        parameter.sliced = True
-        return
-    if not parameter.sliced:
-        rank = get_rank()
-        new_tensor = _load_tensor_by_layout(parameter, layout, rank)
-        parameter.set_data(new_tensor, True)


 def _slice_tensor(tensor, layout, rank_id):
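The new ParallelParamInitProfCtx context manager above times each parameter-initialization step and only logs when MS_DEV_PARAM_INIT_PROF_COLLECT is set. A minimal standalone sketch of the same pattern (the class name, label, and wrapped workload here are illustrative, not MindSpore API):

```python
import os
from time import perf_counter


class InitProfCtx:
    """Time the wrapped block and report it when the env var is set."""

    def __init__(self, label):
        self.label = label
        self.start = None

    def __enter__(self):
        self.start = perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        duration = perf_counter() - self.start
        if os.getenv("MS_DEV_PARAM_INIT_PROF_COLLECT"):
            print(f"{self.label}: {duration:.6f}s")


# Hypothetical usage, mirroring how a _slice_parameter call could be wrapped.
with InitProfCtx("slice_parameter"):
    sum(range(1_000_000))  # stand-in for the real parameter-slicing work
```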
mindspore/parallel/algo_parameter_config.py

@@ -234,7 +234,7 @@ def set_algo_parameters(**kwargs):

     Args:
         fully_use_devices (bool): Whether ONLY searching strategies that fully use all available devices.
-            Default: ``True`` . For example with 8 devices available, if set ``True`` , strategy (4, 1) will not be
+            Default: ``False`` . For example with 8 devices available, if set ``True`` , strategy (4, 1) will not be
             included in ReLU's candidate strategies, because strategy (4, 1) only utilizes 4 devices.
         elementwise_op_strategy_follow (bool): Whether the elementwise operator has the consistent strategies as its
             subsequent operators. Elementwise operators refer to operators that operate on input element by element,
@@ -264,14 +264,14 @@

         For the Ascend devices, users need to prepare the rank table, set rank_id and device_id.
         Please see the `rank table startup
-        <https://www.mindspore.cn/tutorials/experts/en/master/parallel/rank_table.html>`_
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/rank_table.html>`_
         for more details.

         For the GPU devices, users need to prepare the host file and mpi, please see the `mpirun startup
-        <https://www.mindspore.cn/tutorials/experts/en/master/parallel/mpirun.html>`_ .
+        <https://www.mindspore.cn/docs/en/master/model_train/parallel/mpirun.html>`_ .

         For the CPU device, users need to write a dynamic cluster startup script, please see the `Dynamic Cluster
-        Startup <https://www.mindspore.cn/tutorials/experts/en/master/parallel/dynamic_cluster.html>`_ .
+        Startup <https://www.mindspore.cn/docs/en/master/model_train/parallel/dynamic_cluster.html>`_ .

         >>> import numpy as np
         >>> import mindspore as ms
@@ -386,7 +386,7 @@ def reset_algo_parameters():

     After reset, the values of the attributes are:

-    - fully_use_devices: True.
+    - fully_use_devices: False.
     - elementwise_op_strategy_follow: False.
     - enable_algo_approxi: False.
     - algo_approxi_epsilon: 0.1.
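Per the docstring change above, the documented default of fully_use_devices drops from True to False in 2.4.1. A hedged example of opting back into the old search behaviour, assuming set_algo_parameters and get_algo_parameters remain exported at the top-level mindspore namespace as documented:

```python
import mindspore as ms

# Only search operator strategies that keep every available device busy,
# i.e. the behaviour that used to be the default before 2.4.
ms.set_algo_parameters(fully_use_devices=True)
print(ms.get_algo_parameters("fully_use_devices"))  # expected: True
```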
mindspore/parallel/checkpoint_transform.py

@@ -22,12 +22,12 @@ from collections import defaultdict
 import numpy as np
 import mindspore as ms
 from mindspore.common import dtype as mstype
-from mindspore.parallel._utils import _is_in_auto_parallel_mode
+from mindspore.parallel._utils import _is_in_auto_parallel_mode, _get_pipeline_stages
 from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
     _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, \
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
     _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src
-
+from mindspore.parallel.transform_safetensors import _transform_safetensors, _collect_safetensor_files

 __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
            "transform_checkpoints", "sync_pipeline_shared_parameters", "load_segmented_checkpoints"]
@@ -37,7 +37,7 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     """
     Merge parallel strategy between all pipeline stages in pipeline parallel mode.
     For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Note:
         Strategy file of each pipeline stage should be included in src_strategy_dirs.
@@ -72,12 +72,11 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     _merge_json_strategy(src_strategy_files_json, dst_strategy_file)


-
 def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=None):
     """
     List of original distributed checkpoint rank index for obtaining the target checkpoint of a rank_id during the
     distributed checkpoint conversion. For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Args:
         rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -132,7 +131,9 @@ def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=N
         src_rank_id_start = src_pipeline_stage_id * src_stage_device_num
         result_set.update([src_rank_id_start + rank for rank in needed_rank_list_in_local_stage])
         handled_pipeline_stage.append(src_pipeline_stage_id)
-    return list(result_set)
+    result_list = list(result_set)
+    result_list.sort(reverse=True)
+    return list(result_list)


 def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_file_name,
@@ -140,7 +141,7 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
     """
     Transform distributed checkpoint from source sharding strategy to destination sharding strategy by rank
     for a network. For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Args:
         rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -232,7 +233,7 @@ def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckp
     param_attr_dict = defaultdict(dict)
     param_type_dict = defaultdict(dict)
     src_strategy_list, dst_strategy_list, stage_id = _extract_src_dst_layout_map_by_src(src_strategy_file, \
-                                                                                        dst_strategy_file)
+                                                                                         dst_strategy_file)
     src_stage_device_num = np.prod(src_strategy_list.get(list(src_strategy_list.keys())[0])[0]) if src_strategy_list \
                            is not None else 1
     dst_stage_device_num = np.prod(dst_strategy_list.get(list(dst_strategy_list.keys())[0])[0]) if dst_strategy_list \
@@ -357,29 +358,35 @@ def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix


 def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
-                          dst_strategy_file=None):
+                          dst_strategy_file=None, process_num=1, output_format="ckpt"):
     """
     Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
     For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/master/parallel/model_transformation.html>`_.
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Note:
         The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
         rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
         files exist in a rank directory, the last file in the lexicgraphic order would be selected.

+        The number of multiprocess settings is related to the size of the host, and it is not recommended to set it
+        too large, otherwise it may cause freezing.
+
     Args:
         src_checkpoints_dir (str): The source checkpoints directory.
         dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
         ckpt_prefix (str): The destination checkpoint name prefix.
-        src_strategy_file (str): Name of source sharding strategy file which saved by
+        src_strategy_file (str, optional): Name of source sharding strategy file which saved by
            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
            when the 'src_strategy_file' is None, it means that the source sharding strategy is
            without any sharing for each parameter. Default:None.
-        dst_strategy_file (str): Name of destination sharding strategy file which saved by
+        dst_strategy_file (str, optional): Name of destination sharding strategy file which saved by
            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
            when the 'dst_strategy_file' is None, it means that the destination sharding strategy
            is without any sharing for each parameter. Default:None.
+        process_num (int, optional): Number of processes to use for parallel processing. Defaults: 1.
+        output_format (str, optional): Control the format of the output checkpoint after conversion.
+            It can be set to either "ckpt" or "safetensors". Default: "ckpt".

     Raises:
         ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
@@ -393,6 +400,21 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
         ...                           "./src_strategy.ckpt", "./dst_strategy.ckpt")

     """
+    all_safetensor_files_map = _collect_safetensor_files(src_checkpoints_dir)
+    all_ckpt_files_map = _collect_safetensor_files(src_checkpoints_dir, format='ckpt')
+    if all_safetensor_files_map and all_ckpt_files_map:
+        raise ValueError("For 'transform_checkpoints', the 'src_checkpoints_dir' cannot contain "
+                         "both ckpt file and safetensors file simultaneously")
+    if all_safetensor_files_map and not all_ckpt_files_map:
+        _transform_safetensors(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file,
+                               dst_strategy_file, process_num, output_format)
+        return
+    if not all_safetensor_files_map and not all_ckpt_files_map:
+        raise ValueError("For 'transform_checkpoints', the 'src_checkpoints_dir' can not be empty.")
+    if all_ckpt_files_map and not all_safetensor_files_map and output_format == 'safetensors':
+        raise ValueError("For 'transform_checkpoints', 'output_format' can not be 'safetensors' "
+                         "when 'src_checkpoints_dir' only contains ckpt file.")
+
     if not os.path.isdir(src_checkpoints_dir):
         raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
     _make_dir(dst_checkpoints_dir, "path")
@@ -419,7 +441,7 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
     layout_is_passed = src_layout_map and dst_layout_map

     if layout_is_passed and pipeline_stage_num == 1 and dst_stage_num == 1 and \
-       src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
+            src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
         ms.log.info("Transform checkpoint by every pipeline stage.")
         _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
                                        src_strategy_file, dst_strategy_file)
@@ -442,31 +464,13 @@ def _sync_params(name, param, layout):
     is_send = layout[9]
     peer_rank = layout[10]
     sr_tag = layout[11]
-
-    class SharedParameterSyncCell(ms.nn.Cell):
-        """synchronize cell"""
-        def __init__(self, param, is_send, peer_rank, sr_tag):
-            super().__init__()
-            self.param = param
-            self.is_send = is_send
-            self.ret = ms.Tensor([0])
-
-            from mindspore.ops import Send, Receive
-            if self.is_send:
-                self.send = Send(sr_tag=sr_tag, dest_rank=peer_rank)
-            else:
-                self.receive = Receive(sr_tag=sr_tag, src_rank=peer_rank, shape=param.shape, dtype=param.dtype)
-
-        def construct(self):
-            if self.is_send:
-                out = self.send(self.param)
-                return ms.ops.functional.depend(self.ret, out)
-
-            self.param = self.receive(self.ret)
-            return ms.ops.functional.depend(self.ret, self.param)
-
-    sync_net = SharedParameterSyncCell(param, is_send, peer_rank, sr_tag)
-    sync_net()
+    if is_send:
+        ms.ops.Send(sr_tag=sr_tag, dest_rank=peer_rank)(param)
+    else:
+        param.assign_value(ms.ops.Receive(sr_tag=sr_tag,
+                                          src_rank=peer_rank,
+                                          shape=param.shape,
+                                          dtype=param.dtype)(param))


 def sync_pipeline_shared_parameters(net):
@@ -489,7 +493,7 @@ def sync_pipeline_shared_parameters(net):
         Before running the following examples, you need to configure the communication environment variables.

         For the Ascend device, users need to write a dynamic cluster startup script, please see the `Dynamic Cluster
-        Startup <https://www.mindspore.cn/tutorials/experts/en/master/parallel/dynamic_cluster.html>`_ .
+        Startup <https://www.mindspore.cn/docs/en/master/model_train/parallel/dynamic_cluster.html>`_ .

         >>> import numpy as np
         >>> import mindspore as ms
@@ -562,6 +566,9 @@ def sync_pipeline_shared_parameters(net):
               "but got {}.".format(type(net)))
         raise TypeError(msg)

+    if _get_pipeline_stages() < 2:
+        return
+
     layout_dict = net.parameter_layout_dict
     if _is_in_auto_parallel_mode() and not layout_dict:
         from mindspore.common.api import _get_parameter_layout
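Based on the updated signature and docstring above, a sketch of converting a sharded safetensors checkpoint set with the new process_num and output_format keywords; the paths are placeholders and the strategy files are assumed to exist:

```python
import mindspore as ms

# src_checkpoints/ must be laid out as src_checkpoints/rank_0/..., rank_1/..., etc.
ms.transform_checkpoints("./src_checkpoints", "./dst_checkpoints", "transformed",
                         "./src_strategy.ckpt", "./dst_strategy.ckpt",
                         process_num=4, output_format="safetensors")
```

As the new note in the docstring warns, process_num should stay modest relative to the host, otherwise the conversion may freeze.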
mindspore/parallel/cluster/process_entity/__init__.py

@@ -15,4 +15,4 @@
 """Interfaces for ms_run"""
 from ._api import _Node, _MetaServerNode, _ComputeGraphNode, _ProcessManager

-from ._utils import _generate_cmd, _generate_url, _is_local_ip, _send_scale_num, _get_status_and_params
+from ._utils import _generate_cmd, _generate_url, _is_local_ip, _send_scale_num
mindspore/parallel/cluster/process_entity/_api.py

@@ -19,7 +19,7 @@ import sys
 import subprocess
 import mindspore.log as logger
 from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url,\
-    _is_local_ip, _send_scale_num, _get_status_and_params
+    _is_local_ip, _send_scale_num

 class _Node:
     """
@@ -212,6 +212,7 @@ class _ProcessManager:
             raise ValueError(f"Simulation level is set, worker_num must be 1, but got {self.worker_num}.")

         for i in range(self.local_worker_num):
+            os.environ["DEVICE_ID"] = str(i)
             node_id, log_name = self._get_node_id_and_log_path(i)
             if node_id is None:
                 logger.warning(f"Rank ids will be assigned automatically, "
@@ -241,19 +242,6 @@ class _ProcessManager:
             process = cgn.run()
             self.cgn_processes.append(process)

-    def heartbeat_with_scheduler(self):
-        """
-        Sends a heartbeat to the scheduler and updates the worker_num and local_worker_num.
-
-        Returns:
-            bool: True if the network has changed, False otherwise.
-
-        """
-        network_changed, worker_num, local_worker_num = _get_status_and_params(self.scheduler_url)
-        self.worker_num = worker_num
-        self.local_worker_num = local_worker_num
-        return network_changed
-
     def join_processes(self):
         """
         Join all processes to stop.
@@ -261,11 +249,31 @@
         so that understandable root cause of exception could be returned.
         """
         has_exception = False
-        for p in self.cgn_processes:
-            p.wait()
-            if p.returncode != 0:
-                has_exception = True
-                logger.error(f"Worker process {p.pid} exit with exception.")
+        success_cgn_processes = set()
+        while True:
+            # Traversal all workers and kill immediately if any exception happens.
+            for p in self.cgn_processes:
+                ret_code = p.poll()
+                if ret_code is None:
+                    # This means the process is still running, poll next process.
+                    continue
+                elif ret_code != 0:
+                    has_exception = True
+                    logger.error(f"Worker process {p.pid} exit with exception.")
+                    break
+                else:
+                    success_cgn_processes.add(p)
+
+            if has_exception:
+                logger.warning("There's worker exits with exception, kill all other workers.")
+                for p in self.cgn_processes:
+                    if p.poll() is None:
+                        p.kill()
+                break
+            elif len(success_cgn_processes) == len(self.cgn_processes):
+                logger.info("All workers successfully exit!")
+                break
+

         if self.msn_process:
             self.msn_process.wait()
@@ -335,10 +343,10 @@ class _ProcessManager:
         time_out_node_log = re.findall(r"node: .* is timed out", scheduler_log)

         # Filter out node ids of the processes which exit abnormally.
-        def node_id_splitter(id):
-            return re.split(" is timed out", re.split("node: ", id)[1])[0]
-        for id in time_out_node_log:
-            time_out_node_ids.append(node_id_splitter(id))
+        def node_id_splitter(node_id):
+            return re.split(" is timed out", re.split("node: ", node_id)[1])[0]
+        for node_id in time_out_node_log:
+            time_out_node_ids.append(node_id_splitter(node_id))
         logger.error(f"Time out nodes are {time_out_node_ids}")

         os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
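join_processes now polls workers instead of blocking on wait(), so a single failed worker tears the rest down immediately instead of leaving them hanging. A self-contained sketch of that poll-and-kill pattern with plain subprocess (the worker commands are illustrative, and a short sleep is added between polling rounds to avoid a busy spin):

```python
import subprocess
import sys
import time

# Illustrative workers: one exits with an error, the others would run for a while.
cmds = [
    [sys.executable, "-c", "import time; time.sleep(30)"],
    [sys.executable, "-c", "raise SystemExit(1)"],
    [sys.executable, "-c", "import time; time.sleep(30)"],
]
procs = [subprocess.Popen(c) for c in cmds]

finished = set()
failed = False
while True:
    for p in procs:
        ret = p.poll()
        if ret is None:        # still running, check the next worker
            continue
        if ret != 0:           # one worker failed: stop supervising
            failed = True
            break
        finished.add(p)
    if failed:
        for p in procs:        # kill everything that is still alive
            if p.poll() is None:
                p.kill()
        break
    if len(finished) == len(procs):
        break
    time.sleep(0.1)

print("a worker failed" if failed else "all workers exited cleanly")
```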
mindspore/parallel/cluster/process_entity/_utils.py

@@ -16,7 +16,6 @@
 import os
 import json
 import socket
-import requests
 import mindspore.log as logger

 def _generate_cmd(cmd, cmd_args, output_name):
@@ -25,7 +24,7 @@ def _generate_cmd(cmd, cmd_args, output_name):
     edirecting the output to a log file.

     """
-    if cmd not in ['python', 'pytest']:
+    if cmd not in ['python', 'pytest', 'python3']:
         # If user don't set binary file name, defaulty use 'python' to launch the job.
         command = f"python {cmd} {' '.join(cmd_args)} > {output_name} 2>&1 &"
     else:
@@ -99,28 +98,4 @@ def _send_scale_num(url, scale_num):
     Send an HTTP request to a specified URL, informing scale_num.

     """
-    try:
-        response = requests.post(url, data={"scale_num": scale_num}, timeout=100)
-        response.raise_for_status()
-        response_data = response.json()
-        response_bool = bool(response_data)
-        return response_bool
-    except requests.exceptions.RequestException:
-        return None
-
-
-def _get_status_and_params(url):
-    """
-    Send an HTTP request to a specified URL to query status and retrieve partial parameters.
-
-    """
-    try:
-        response = requests.get(url, timeout=100)
-        response.raise_for_status()
-        response_data = response.json()
-        network_changed = response_data.get("network_changed")
-        worker_num = response_data.get("worker_num")
-        local_worker_num = response_data.get("local_worker_num")
-        return network_changed, worker_num, local_worker_num
-    except requests.exceptions.RequestException:
-        return None
+    return ""
mindspore/parallel/parameter_broadcast.py

@@ -18,6 +18,8 @@ from __future__ import absolute_import
 __all__ = ["parameter_broadcast"]

 import numpy as np
+import mindspore as ms
+from mindspore.communication import get_rank, create_group, get_group_size


 def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
@@ -104,9 +106,6 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
     """
     if not layout:
         return
-    import mindspore as ms
-    from mindspore import Tensor
-    from mindspore.communication import get_rank, create_group, get_group_size
     from mindspore.train._utils import get_parameter_redundancy, remove_param_redundancy
     from mindspore.nn.wrap.cell_wrapper import AllreduceGraph
     origin_parallel_mode = ms.get_auto_parallel_context("parallel_mode")
@@ -143,7 +142,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
                 raise ValueError(f"For parameter broadcast, the param: {param} can not be found.")
             real_param = net_param_dict[param]
             if param not in single_params[cur_rank]:
-                real_param.set_data(Tensor(np.zeros(real_param.shape), dtype=real_param.dtype))
+                real_param.set_data(ms.Tensor(np.zeros(real_param.shape), dtype=real_param.dtype))
             allreduce_input.append(real_param)
         if not allreduce_input:
             continue