mindspore 2.3.0-cp310-cp310-win_amd64.whl → 2.4.0-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +3 -1
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +50 -9
- mindspore/_extends/parse/compile_config.py +41 -0
- mindspore/_extends/parse/parser.py +9 -7
- mindspore/_extends/parse/standard_method.py +52 -14
- mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
- mindspore/amp.py +24 -10
- mindspore/atlprov.dll +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/__init__.py +6 -4
- mindspore/common/_pijit_context.py +190 -0
- mindspore/common/_register_for_tensor.py +2 -1
- mindspore/common/_tensor_overload.py +139 -0
- mindspore/common/api.py +102 -87
- mindspore/common/dump.py +5 -6
- mindspore/common/generator.py +1 -7
- mindspore/common/hook_handle.py +14 -26
- mindspore/common/mindir_util.py +2 -2
- mindspore/common/parameter.py +46 -13
- mindspore/common/recompute.py +39 -9
- mindspore/common/sparse_tensor.py +7 -3
- mindspore/common/tensor.py +209 -29
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +38 -3
- mindspore/communication/comm_func.py +310 -55
- mindspore/communication/management.py +14 -14
- mindspore/context.py +123 -22
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/__init__.py +1 -1
- mindspore/dataset/core/config.py +7 -0
- mindspore/dataset/core/validator_helpers.py +7 -0
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +72 -44
- mindspore/dataset/engine/datasets_audio.py +7 -7
- mindspore/dataset/engine/datasets_standard_format.py +53 -3
- mindspore/dataset/engine/datasets_text.py +20 -20
- mindspore/dataset/engine/datasets_user_defined.py +174 -104
- mindspore/dataset/engine/datasets_vision.py +33 -33
- mindspore/dataset/engine/iterators.py +29 -0
- mindspore/dataset/engine/obs/util.py +7 -0
- mindspore/dataset/engine/queue.py +114 -60
- mindspore/dataset/engine/serializer_deserializer.py +2 -2
- mindspore/dataset/engine/validators.py +34 -14
- mindspore/dataset/text/__init__.py +1 -4
- mindspore/dataset/transforms/__init__.py +0 -3
- mindspore/dataset/utils/line_reader.py +2 -0
- mindspore/dataset/vision/__init__.py +1 -4
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/dataset/vision/validators.py +2 -1
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
- mindspore/experimental/es/embedding_service.py +883 -0
- mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
- mindspore/experimental/llm_boost/__init__.py +21 -0
- mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
- mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
- mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
- mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
- mindspore/experimental/llm_boost/register.py +129 -0
- mindspore/experimental/llm_boost/utils.py +31 -0
- mindspore/experimental/optim/adamw.py +85 -0
- mindspore/experimental/optim/optimizer.py +3 -0
- mindspore/hal/__init__.py +3 -3
- mindspore/hal/contiguous_tensors_handle.py +175 -0
- mindspore/hal/stream.py +18 -0
- mindspore/include/api/model_group.h +13 -1
- mindspore/include/api/types.h +10 -10
- mindspore/include/dataset/config.h +2 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/dataset/execute.h +2 -2
- mindspore/include/dataset/vision.h +4 -0
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +1 -1
- mindspore/mindrecord/filewriter.py +68 -51
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +495 -46
- mindspore/mint/distributed/__init__.py +31 -0
- mindspore/mint/distributed/distributed.py +254 -0
- mindspore/mint/nn/__init__.py +266 -21
- mindspore/mint/nn/functional.py +125 -19
- mindspore/mint/nn/layer/__init__.py +39 -0
- mindspore/mint/nn/layer/activation.py +133 -0
- mindspore/mint/nn/layer/normalization.py +477 -0
- mindspore/mint/nn/layer/pooling.py +110 -0
- mindspore/mint/optim/adamw.py +28 -7
- mindspore/mint/special/__init__.py +63 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/multiprocessing/__init__.py +2 -1
- mindspore/nn/__init__.py +0 -1
- mindspore/nn/cell.py +275 -93
- mindspore/nn/layer/activation.py +211 -44
- mindspore/nn/layer/basic.py +113 -3
- mindspore/nn/layer/embedding.py +120 -2
- mindspore/nn/layer/normalization.py +101 -5
- mindspore/nn/layer/padding.py +34 -48
- mindspore/nn/layer/pooling.py +161 -7
- mindspore/nn/layer/transformer.py +3 -3
- mindspore/nn/loss/__init__.py +2 -2
- mindspore/nn/loss/loss.py +84 -6
- mindspore/nn/optim/__init__.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +127 -0
- mindspore/nn/wrap/cell_wrapper.py +12 -23
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/nn/wrap/loss_scale.py +17 -3
- mindspore/numpy/__init__.py +1 -1
- mindspore/numpy/array_creations.py +65 -68
- mindspore/numpy/array_ops.py +64 -60
- mindspore/numpy/fft.py +610 -75
- mindspore/numpy/logic_ops.py +11 -10
- mindspore/numpy/math_ops.py +85 -84
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -4
- mindspore/ops/_grad_experimental/grad_comm_ops.py +47 -3
- mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
- mindspore/ops/_vmap/vmap_array_ops.py +2 -4
- mindspore/ops/_vmap/vmap_math_ops.py +17 -1
- mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +85 -7
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
- mindspore/ops/auto_generate/gen_extend_func.py +734 -13
- mindspore/ops/auto_generate/gen_ops_def.py +2420 -381
- mindspore/ops/auto_generate/gen_ops_prim.py +5196 -1659
- mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
- mindspore/ops/composite/base.py +85 -48
- mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
- mindspore/ops/function/__init__.py +22 -0
- mindspore/ops/function/array_func.py +490 -153
- mindspore/ops/function/debug_func.py +113 -1
- mindspore/ops/function/fft_func.py +15 -2
- mindspore/ops/function/grad/grad_func.py +3 -2
- mindspore/ops/function/math_func.py +558 -207
- mindspore/ops/function/nn_func.py +817 -383
- mindspore/ops/function/other_func.py +3 -2
- mindspore/ops/function/random_func.py +184 -8
- mindspore/ops/function/reshard_func.py +13 -11
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +3 -2
- mindspore/ops/functional.py +24 -14
- mindspore/ops/op_info_register.py +3 -3
- mindspore/ops/operations/__init__.py +6 -1
- mindspore/ops/operations/_grad_ops.py +2 -76
- mindspore/ops/operations/_infer_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +71 -94
- mindspore/ops/operations/array_ops.py +12 -146
- mindspore/ops/operations/comm_ops.py +42 -53
- mindspore/ops/operations/custom_ops.py +83 -19
- mindspore/ops/operations/debug_ops.py +42 -10
- mindspore/ops/operations/manually_defined/_inner.py +12 -0
- mindspore/ops/operations/manually_defined/ops_def.py +265 -10
- mindspore/ops/operations/math_ops.py +12 -223
- mindspore/ops/operations/nn_ops.py +20 -114
- mindspore/ops/operations/other_ops.py +7 -4
- mindspore/ops/operations/random_ops.py +46 -1
- mindspore/ops/primitive.py +18 -6
- mindspore/ops_generate/arg_dtype_cast.py +2 -0
- mindspore/ops_generate/gen_aclnn_implement.py +11 -11
- mindspore/ops_generate/gen_constants.py +36 -0
- mindspore/ops_generate/gen_ops.py +67 -52
- mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
- mindspore/ops_generate/gen_pyboost_func.py +131 -47
- mindspore/ops_generate/op_proto.py +10 -3
- mindspore/ops_generate/pyboost_utils.py +14 -1
- mindspore/ops_generate/template.py +43 -21
- mindspore/parallel/__init__.py +3 -1
- mindspore/parallel/_auto_parallel_context.py +28 -8
- mindspore/parallel/_cell_wrapper.py +83 -0
- mindspore/parallel/_parallel_serialization.py +47 -19
- mindspore/parallel/_tensor.py +81 -11
- mindspore/parallel/_utils.py +13 -1
- mindspore/parallel/algo_parameter_config.py +5 -5
- mindspore/parallel/checkpoint_transform.py +46 -39
- mindspore/parallel/cluster/process_entity/__init__.py +1 -1
- mindspore/parallel/cluster/process_entity/_api.py +31 -23
- mindspore/parallel/cluster/process_entity/_utils.py +2 -27
- mindspore/parallel/parameter_broadcast.py +3 -4
- mindspore/parallel/shard.py +162 -31
- mindspore/parallel/transform_safetensors.py +993 -0
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/constant.py +29 -0
- mindspore/profiler/common/registry.py +47 -0
- mindspore/profiler/common/util.py +28 -0
- mindspore/profiler/dynamic_profiler.py +694 -0
- mindspore/profiler/envprofiling.py +17 -19
- mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
- mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
- mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
- mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
- mindspore/profiler/parser/base_timeline_generator.py +19 -25
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
- mindspore/profiler/parser/framework_parser.py +1 -391
- mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
- mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
- mindspore/profiler/parser/memory_usage_parser.py +0 -154
- mindspore/profiler/parser/profiler_info.py +78 -6
- mindspore/profiler/profiler.py +153 -0
- mindspore/profiler/profiling.py +280 -412
- mindspore/rewrite/__init__.py +1 -2
- mindspore/rewrite/common/namespace.py +4 -4
- mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
- mindspore/run_check/_check_version.py +36 -103
- mindspore/safeguard/rewrite_obfuscation.py +591 -247
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +4 -3
- mindspore/train/_utils.py +28 -2
- mindspore/train/amp.py +171 -53
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +85 -22
- mindspore/train/callback/_cluster_monitor.py +1 -1
- mindspore/train/callback/_flops_collector.py +1 -0
- mindspore/train/callback/_loss_monitor.py +3 -3
- mindspore/train/callback/_on_request_exit.py +134 -31
- mindspore/train/callback/_summary_collector.py +5 -5
- mindspore/train/callback/_tft_register.py +352 -0
- mindspore/train/dataset_helper.py +7 -3
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/metrics/roc.py +4 -4
- mindspore/train/mind_ir_pb2.py +44 -39
- mindspore/train/model.py +134 -58
- mindspore/train/serialization.py +336 -112
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +21 -0
- mindspore/utils/utils.py +60 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/METADATA +6 -2
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/RECORD +281 -275
- mindspore/include/c_api/ms/abstract.h +0 -67
- mindspore/include/c_api/ms/attribute.h +0 -197
- mindspore/include/c_api/ms/base/handle_types.h +0 -43
- mindspore/include/c_api/ms/base/macros.h +0 -32
- mindspore/include/c_api/ms/base/status.h +0 -33
- mindspore/include/c_api/ms/base/types.h +0 -283
- mindspore/include/c_api/ms/context.h +0 -102
- mindspore/include/c_api/ms/graph.h +0 -160
- mindspore/include/c_api/ms/node.h +0 -606
- mindspore/include/c_api/ms/tensor.h +0 -161
- mindspore/include/c_api/ms/value.h +0 -84
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/nn/extend/basic.py +0 -140
- mindspore/nn/extend/embedding.py +0 -143
- mindspore/nn/extend/layer/normalization.py +0 -109
- mindspore/nn/extend/pooling.py +0 -117
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
- mindspore/ops/extend/__init__.py +0 -53
- mindspore/ops/extend/array_func.py +0 -218
- mindspore/ops/extend/math_func.py +0 -76
- mindspore/ops/extend/nn_func.py +0 -308
- mindspore/ops/silent_check.py +0 -162
- mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
- mindspore/profiler/parser/msadvisor_parser.py +0 -240
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/WHEEL +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.0.dist-info}/top_level.txt +0 -0
@@ -22,12 +22,12 @@ from collections import defaultdict
 import numpy as np
 import mindspore as ms
 from mindspore.common import dtype as mstype
-from mindspore.parallel._utils import _is_in_auto_parallel_mode
+from mindspore.parallel._utils import _is_in_auto_parallel_mode, _get_pipeline_stages
 from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
     _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, \
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
     _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src
-
+from mindspore.parallel.transform_safetensors import _transform_safetensors, _collect_safetensor_files

 __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
            "transform_checkpoints", "sync_pipeline_shared_parameters", "load_segmented_checkpoints"]
@@ -37,7 +37,7 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     """
     Merge parallel strategy between all pipeline stages in pipeline parallel mode.
     For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Note:
         Strategy file of each pipeline stage should be included in src_strategy_dirs.
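A minimal usage sketch of the documented API, assuming merge_pipeline_strategys stays exported from the top-level mindspore namespace as in 2.3; the directory and file names are placeholders.

    import mindspore as ms

    # Each pipeline stage has saved its own strategy file into ./src_pipeline_strategys/,
    # e.g. src_strategy_0.ckpt, src_strategy_1.ckpt, ...; the merged strategy is written to the dst file.
    ms.merge_pipeline_strategys("./src_pipeline_strategys", "./merged_strategy.ckpt")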
@@ -72,12 +72,11 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     _merge_json_strategy(src_strategy_files_json, dst_strategy_file)


-
 def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=None):
     """
     List of original distributed checkpoint rank index for obtaining the target checkpoint of a rank_id during the
     distributed checkpoint conversion. For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Args:
         rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -132,7 +131,9 @@ def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=N
             src_rank_id_start = src_pipeline_stage_id * src_stage_device_num
             result_set.update([src_rank_id_start + rank for rank in needed_rank_list_in_local_stage])
             handled_pipeline_stage.append(src_pipeline_stage_id)
-
+    result_list = list(result_set)
+    result_list.sort(reverse=True)
+    return list(result_list)


 def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_file_name,
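For illustration, a hedged call-site sketch of rank_list_for_transform, assuming it remains a top-level mindspore export; per the hunk above, 2.4.0 now materialises the collected rank set into a descending-sorted list before returning it. The strategy file paths are placeholders.

    import mindspore as ms

    # Which source-rank checkpoints are needed to assemble rank 0 under the new strategy?
    needed = ms.rank_list_for_transform(0, "./src_strategy.ckpt", "./dst_strategy.ckpt")
    for rank in needed:
        print(f"checkpoint of rank {rank} is required")   # ranks come back sorted in descending order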
@@ -140,7 +141,7 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
     """
     Transform distributed checkpoint from source sharding strategy to destination sharding strategy by rank
     for a network. For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Args:
         rank_id (int): The rank of which distributed checkpoint needs to be obtained after conversion.
@@ -232,7 +233,7 @@ def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckp
     param_attr_dict = defaultdict(dict)
     param_type_dict = defaultdict(dict)
     src_strategy_list, dst_strategy_list, stage_id = _extract_src_dst_layout_map_by_src(src_strategy_file, \
-
+                                                                                        dst_strategy_file)
     src_stage_device_num = np.prod(src_strategy_list.get(list(src_strategy_list.keys())[0])[0]) if src_strategy_list \
         is not None else 1
     dst_stage_device_num = np.prod(dst_strategy_list.get(list(dst_strategy_list.keys())[0])[0]) if dst_strategy_list \
@@ -357,29 +358,35 @@ def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix


 def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
-                          dst_strategy_file=None):
+                          dst_strategy_file=None, process_num=1, output_format="ckpt"):
     """
     Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
     For more details about converting distributed Checkpoint, please refer to
-    `Model Transformation <https://www.mindspore.cn/
+    `Model Transformation <https://www.mindspore.cn/docs/en/master/model_train/parallel/model_transformation.html>`_.

     Note:
         The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
         rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
         files exist in a rank directory, the last file in the lexicgraphic order would be selected.

+        The number of multiprocess settings is related to the size of the host, and it is not recommended to set it
+        too large, otherwise it may cause freezing.
+
     Args:
         src_checkpoints_dir (str): The source checkpoints directory.
         dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
         ckpt_prefix (str): The destination checkpoint name prefix.
-        src_strategy_file (str): Name of source sharding strategy file which saved by
+        src_strategy_file (str, optional): Name of source sharding strategy file which saved by
             'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
             when the 'src_strategy_file' is None, it means that the source sharding strategy is
            without any sharing for each parameter. Default:None.
-        dst_strategy_file (str): Name of destination sharding strategy file which saved by
+        dst_strategy_file (str, optional): Name of destination sharding strategy file which saved by
            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
            when the 'dst_strategy_file' is None, it means that the destination sharding strategy
            is without any sharing for each parameter. Default:None.
+        process_num (int, optional): Number of processes to use for parallel processing. Defaults: 1.
+        output_format (str, optional): Control the format of the output checkpoint after conversion.
+            It can be set to either "ckpt" or "safetensors". Default: "ckpt".

     Raises:
         ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
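A hedged usage sketch of the extended signature, assuming transform_checkpoints remains importable from the top-level mindspore namespace; the directories, prefix, and strategy paths are placeholders, and output_format="safetensors" is only accepted when the source directory already holds safetensors files (see the validation added in the next hunk).

    import mindspore as ms

    # "./src_checkpoints/rank_0/...", "./src_checkpoints/rank_1/...", as described in the Note above.
    ms.transform_checkpoints("./src_checkpoints", "./dst_checkpoints", "transformed",
                             "./src_strategy.ckpt", "./dst_strategy.ckpt",
                             process_num=4, output_format="ckpt")   # keep process_num modest per the Note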
@@ -393,6 +400,21 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
         ... "./src_strategy.ckpt", "./dst_strategy.ckpt")

     """
+    all_safetensor_files_map = _collect_safetensor_files(src_checkpoints_dir)
+    all_ckpt_files_map = _collect_safetensor_files(src_checkpoints_dir, format='ckpt')
+    if all_safetensor_files_map and all_ckpt_files_map:
+        raise ValueError("For 'transform_checkpoints', the 'src_checkpoints_dir' cannot contain "
+                         "both ckpt file and safetensors file simultaneously")
+    if all_safetensor_files_map and not all_ckpt_files_map:
+        _transform_safetensors(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file,
+                               dst_strategy_file, process_num, output_format)
+        return
+    if not all_safetensor_files_map and not all_ckpt_files_map:
+        raise ValueError("For 'transform_checkpoints', the 'src_checkpoints_dir' can not be empty.")
+    if all_ckpt_files_map and not all_safetensor_files_map and output_format == 'safetensors':
+        raise ValueError("For 'transform_checkpoints', 'output_format' can not be 'safetensors' "
+                         "when 'src_checkpoints_dir' only contains ckpt file.")
+
     if not os.path.isdir(src_checkpoints_dir):
         raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
     _make_dir(dst_checkpoints_dir, "path")
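The new guard distinguishes four situations: mixed formats (error), safetensors only (delegate to _transform_safetensors), an empty source directory (error), and ckpt-only input combined with a safetensors output request (error). A rough standalone sketch of the same decision, using glob instead of the internal _collect_safetensor_files helper, whose real signature may differ; this is illustrative only.

    import glob
    import os

    def classify_checkpoint_dir(src_dir):
        # Illustrative only: look for both formats under the rank_* subdirectories.
        safetensors = glob.glob(os.path.join(src_dir, "rank_*", "*.safetensors"))
        ckpts = glob.glob(os.path.join(src_dir, "rank_*", "*.ckpt"))
        if safetensors and ckpts:
            raise ValueError("source directory mixes ckpt and safetensors files")
        if not safetensors and not ckpts:
            raise ValueError("source directory contains no checkpoint files")
        return "safetensors" if safetensors else "ckpt"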
@@ -419,7 +441,7 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
     layout_is_passed = src_layout_map and dst_layout_map

     if layout_is_passed and pipeline_stage_num == 1 and dst_stage_num == 1 and \
-
+            src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
         ms.log.info("Transform checkpoint by every pipeline stage.")
         _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
                                        src_strategy_file, dst_strategy_file)
@@ -442,31 +464,13 @@ def _sync_params(name, param, layout):
     is_send = layout[9]
     peer_rank = layout[10]
     sr_tag = layout[11]
-
-
-
-
-
-
-
-            self.ret = ms.Tensor([0])
-
-            from mindspore.ops import Send, Receive
-            if self.is_send:
-                self.send = Send(sr_tag=sr_tag, dest_rank=peer_rank)
-            else:
-                self.receive = Receive(sr_tag=sr_tag, src_rank=peer_rank, shape=param.shape, dtype=param.dtype)
-
-        def construct(self):
-            if self.is_send:
-                out = self.send(self.param)
-                return ms.ops.functional.depend(self.ret, out)
-
-            self.param = self.receive(self.ret)
-            return ms.ops.functional.depend(self.ret, self.param)
-
-    sync_net = SharedParameterSyncCell(param, is_send, peer_rank, sr_tag)
-    sync_net()
+    if is_send:
+        ms.ops.Send(sr_tag=sr_tag, dest_rank=peer_rank)(param)
+    else:
+        param.assign_value(ms.ops.Receive(sr_tag=sr_tag,
+                                          src_rank=peer_rank,
+                                          shape=param.shape,
+                                          dtype=param.dtype)(param))


 def sync_pipeline_shared_parameters(net):
@@ -489,7 +493,7 @@ def sync_pipeline_shared_parameters(net):
         Before running the following examples, you need to configure the communication environment variables.

         For the Ascend device, users need to write a dynamic cluster startup script, please see the `Dynamic Cluster
-        Startup <https://www.mindspore.cn/
+        Startup <https://www.mindspore.cn/docs/en/master/model_train/parallel/dynamic_cluster.html>`_ .

         >>> import numpy as np
         >>> import mindspore as ms
@@ -562,6 +566,9 @@ def sync_pipeline_shared_parameters(net):
               "but got {}.".format(type(net)))
         raise TypeError(msg)

+    if _get_pipeline_stages() < 2:
+        return
+
     layout_dict = net.parameter_layout_dict
     if _is_in_auto_parallel_mode() and not layout_dict:
         from mindspore.common.api import _get_parameter_layout
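With the early return added above, calling the API on a job with fewer than two pipeline stages becomes a no-op. A hedged call-site sketch, assuming the function is importable from mindspore.parallel; PipelineNet stands in for a user-defined pipeline-parallel Cell.

    import mindspore as ms
    from mindspore.parallel import sync_pipeline_shared_parameters

    ms.set_auto_parallel_context(pipeline_stages=2)
    net = PipelineNet()                      # placeholder for a pipeline-parallel Cell
    # Broadcast the parameters shared across pipeline stages before inference;
    # with the new guard this does nothing whenever pipeline_stages < 2.
    sync_pipeline_shared_parameters(net)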
@@ -15,4 +15,4 @@
 """Interfaces for ms_run"""
 from ._api import _Node, _MetaServerNode, _ComputeGraphNode, _ProcessManager

-from ._utils import _generate_cmd, _generate_url, _is_local_ip, _send_scale_num
+from ._utils import _generate_cmd, _generate_url, _is_local_ip, _send_scale_num
@@ -19,7 +19,7 @@ import sys
 import subprocess
 import mindspore.log as logger
 from ._utils import _generate_cmd_args_list, _generate_cmd_args_list_with_core, _generate_url,\
-    _is_local_ip, _send_scale_num
+    _is_local_ip, _send_scale_num

 class _Node:
     """
@@ -212,6 +212,7 @@ class _ProcessManager:
             raise ValueError(f"Simulation level is set, worker_num must be 1, but got {self.worker_num}.")

         for i in range(self.local_worker_num):
+            os.environ["DEVICE_ID"] = str(i)
             node_id, log_name = self._get_node_id_and_log_path(i)
             if node_id is None:
                 logger.warning(f"Rank ids will be assigned automatically, "
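The added line assigns a distinct DEVICE_ID to each locally spawned worker before it is launched, so every child process inherits its own device index through the environment. A minimal sketch of the same idea with plain subprocess, here passing the environment explicitly per process rather than mutating os.environ; the worker count and script name are placeholders.

    import os
    import subprocess

    local_worker_num = 4                                   # placeholder
    procs = []
    for i in range(local_worker_num):
        env = dict(os.environ, DEVICE_ID=str(i))           # one device index per local worker
        procs.append(subprocess.Popen(["python", "train.py"], env=env))   # train.py is a placeholder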
@@ -241,19 +242,6 @@ class _ProcessManager:
             process = cgn.run()
             self.cgn_processes.append(process)

-    def heartbeat_with_scheduler(self):
-        """
-        Sends a heartbeat to the scheduler and updates the worker_num and local_worker_num.
-
-        Returns:
-            bool: True if the network has changed, False otherwise.
-
-        """
-        network_changed, worker_num, local_worker_num = _get_status_and_params(self.scheduler_url)
-        self.worker_num = worker_num
-        self.local_worker_num = local_worker_num
-        return network_changed
-
     def join_processes(self):
         """
         Join all processes to stop.
@@ -261,11 +249,31 @@ class _ProcessManager:
         so that understandable root cause of exception could be returned.
         """
         has_exception = False
-
-
-        if
-
-
+        success_cgn_processes = set()
+        while True:
+            # Traversal all workers and kill immediately if any exception happens.
+            for p in self.cgn_processes:
+                ret_code = p.poll()
+                if ret_code is None:
+                    # This means the process is still running, poll next process.
+                    continue
+                elif ret_code != 0:
+                    has_exception = True
+                    logger.error(f"Worker process {p.pid} exit with exception.")
+                    break
+                else:
+                    success_cgn_processes.add(p)
+
+            if has_exception:
+                logger.warning("There's worker exits with exception, kill all other workers.")
+                for p in self.cgn_processes:
+                    if p.poll() is None:
+                        p.kill()
+                break
+            elif len(success_cgn_processes) == len(self.cgn_processes):
+                logger.info("All workers successfully exit!")
+                break
+

         if self.msn_process:
             self.msn_process.wait()
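The rewritten join_processes replaces the removed heartbeat mechanism with a local polling loop: poll every worker, record clean exits, and kill the remaining workers as soon as any of them exits with a non-zero code. A self-contained sketch of that pattern, detached from _ProcessManager; the sleep is an assumption, the real loop may poll without pausing.

    import subprocess
    import time

    def join_all(procs):
        """procs is a list of subprocess.Popen handles; returns True only if all exit cleanly."""
        finished = set()
        while True:
            failed = False
            for p in procs:
                code = p.poll()
                if code is None:        # still running, check the next one
                    continue
                if code != 0:           # one worker failed: stop everything
                    failed = True
                    break
                finished.add(p)
            if failed:
                for p in procs:
                    if p.poll() is None:
                        p.kill()
                return False
            if len(finished) == len(procs):
                return True
            time.sleep(1)               # simple back-off between polling rounds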
@@ -335,10 +343,10 @@ class _ProcessManager:
         time_out_node_log = re.findall(r"node: .* is timed out", scheduler_log)

         # Filter out node ids of the processes which exit abnormally.
-        def node_id_splitter(
-            return re.split(" is timed out", re.split("node: ",
-        for
-            time_out_node_ids.append(node_id_splitter(
+        def node_id_splitter(node_id):
+            return re.split(" is timed out", re.split("node: ", node_id)[1])[0]
+        for node_id in time_out_node_log:
+            time_out_node_ids.append(node_id_splitter(node_id))
         logger.error(f"Time out nodes are {time_out_node_ids}")

         os.system(f"grep -rn -E 'ERROR|CRITICAL|Traceback|Error' -C 5 {self.log_dir}")
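For clarity, the fixed helper takes a matched log fragment and extracts the node id between the "node: " prefix and the " is timed out" suffix. A quick standalone check of that string handling; the log line itself is made up.

    import re

    def node_id_splitter(node_id):
        return re.split(" is timed out", re.split("node: ", node_id)[1])[0]

    print(node_id_splitter("node: worker_3 is timed out"))   # -> worker_3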
@@ -16,7 +16,6 @@
 import os
 import json
 import socket
-import requests
 import mindspore.log as logger

 def _generate_cmd(cmd, cmd_args, output_name):
@@ -25,7 +24,7 @@ def _generate_cmd(cmd, cmd_args, output_name):
     edirecting the output to a log file.

     """
-    if cmd not in ['python', 'pytest']:
+    if cmd not in ['python', 'pytest', 'python3']:
         # If user don't set binary file name, defaulty use 'python' to launch the job.
         command = f"python {cmd} {' '.join(cmd_args)} > {output_name} 2>&1 &"
     else:
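Based on the context lines above, when cmd is not one of the recognised launchers the helper treats it as a script and wraps it with "python", appending the arguments and log redirection. A sketch of the string produced under that branch; the values are examples.

    cmd, cmd_args, output_name = "train.py", ["--epochs", "10"], "worker_0.log"
    command = f"python {cmd} {' '.join(cmd_args)} > {output_name} 2>&1 &"
    print(command)   # python train.py --epochs 10 > worker_0.log 2>&1 &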
@@ -99,28 +98,4 @@ def _send_scale_num(url, scale_num):
     Send an HTTP request to a specified URL, informing scale_num.

     """
-
-        response = requests.post(url, data={"scale_num": scale_num}, timeout=100)
-        response.raise_for_status()
-        response_data = response.json()
-        response_bool = bool(response_data)
-        return response_bool
-    except requests.exceptions.RequestException:
-        return None
-
-
-def _get_status_and_params(url):
-    """
-    Send an HTTP request to a specified URL to query status and retrieve partial parameters.
-
-    """
-    try:
-        response = requests.get(url, timeout=100)
-        response.raise_for_status()
-        response_data = response.json()
-        network_changed = response_data.get("network_changed")
-        worker_num = response_data.get("worker_num")
-        local_worker_num = response_data.get("local_worker_num")
-        return network_changed, worker_num, local_worker_num
-    except requests.exceptions.RequestException:
-        return None
+    return ""
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 __all__ = ["parameter_broadcast"]

 import numpy as np
+import mindspore as ms
+from mindspore.communication import get_rank, create_group, get_group_size


 def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
@@ -104,9 +106,6 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
     """
     if not layout:
         return
-    import mindspore as ms
-    from mindspore import Tensor
-    from mindspore.communication import get_rank, create_group, get_group_size
     from mindspore.train._utils import get_parameter_redundancy, remove_param_redundancy
     from mindspore.nn.wrap.cell_wrapper import AllreduceGraph
     origin_parallel_mode = ms.get_auto_parallel_context("parallel_mode")
@@ -143,7 +142,7 @@ def parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
                 raise ValueError(f"For parameter broadcast, the param: {param} can not be found.")
             real_param = net_param_dict[param]
             if param not in single_params[cur_rank]:
-                real_param.set_data(Tensor(np.zeros(real_param.shape), dtype=real_param.dtype))
+                real_param.set_data(ms.Tensor(np.zeros(real_param.shape), dtype=real_param.dtype))
             allreduce_input.append(real_param)
         if not allreduce_input:
             continue
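The surrounding code implements broadcast-by-allreduce: ranks that do not own a parameter overwrite their copy with zeros, and a subsequent AllReduce (sum) over the group restores the owner's values everywhere. A toy NumPy illustration of why that works, with two ranks and one owner; the values are arbitrary.

    import numpy as np

    owner = np.array([1.5, -2.0, 3.0])      # the rank that holds the real parameter value
    other = np.zeros_like(owner)            # every other rank zero-fills its copy, as in the hunk above
    # AllReduce(sum) over the group: every rank ends up with the owner's values.
    print(owner + other)                    # [ 1.5 -2.   3. ]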