mindspore 2.2.11-cp39-cp39-win_amd64.whl → 2.2.14-cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/Microsoft.VisualStudio.Telemetry.dll +0 -0
- mindspore/Newtonsoft.Json.dll +0 -0
- mindspore/__init__.py +2 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/atlprov.dll +0 -0
- mindspore/c1.dll +0 -0
- mindspore/c1xx.dll +0 -0
- mindspore/c2.dll +0 -0
- mindspore/common/tensor.py +0 -2
- mindspore/communication/management.py +3 -0
- mindspore/context.py +34 -4
- mindspore/dataset/engine/datasets.py +23 -0
- mindspore/dataset/engine/validators.py +1 -1
- mindspore/dataset/vision/py_transforms_util.py +2 -2
- mindspore/dnnl.dll +0 -0
- mindspore/dpcmi.dll +0 -0
- mindspore/experimental/optim/lr_scheduler.py +5 -6
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar100_to_mr.py +49 -57
- mindspore/mindrecord/tools/cifar10_to_mr.py +46 -55
- mindspore/mindrecord/tools/csv_to_mr.py +3 -8
- mindspore/mindrecord/tools/mnist_to_mr.py +4 -9
- mindspore/mindrecord/tools/tfrecord_to_mr.py +1 -4
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/msobj140.dll +0 -0
- mindspore/mspdb140.dll +0 -0
- mindspore/mspdbcore.dll +0 -0
- mindspore/mspdbst.dll +0 -0
- mindspore/mspft140.dll +0 -0
- mindspore/msvcdis140.dll +0 -0
- mindspore/msvcp140_1.dll +0 -0
- mindspore/msvcp140_2.dll +0 -0
- mindspore/msvcp140_atomic_wait.dll +0 -0
- mindspore/msvcp140_codecvt_ids.dll +0 -0
- mindspore/nn/layer/activation.py +1 -1
- mindspore/nn/layer/embedding.py +2 -2
- mindspore/nn/loss/loss.py +1 -1
- mindspore/nn/optim/ada_grad.py +2 -2
- mindspore/nn/optim/sgd.py +3 -2
- mindspore/numpy/math_ops.py +1 -1
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +3 -0
- mindspore/ops/_grad_experimental/grad_array_ops.py +0 -31
- mindspore/ops/_grad_experimental/grad_comm_ops.py +4 -2
- mindspore/ops/_grad_experimental/grad_inner_ops.py +8 -0
- mindspore/ops/_grad_experimental/grad_math_ops.py +37 -17
- mindspore/ops/_op_impl/aicpu/__init__.py +1 -0
- mindspore/ops/_op_impl/aicpu/generate_eod_mask.py +38 -0
- mindspore/ops/function/array_func.py +6 -5
- mindspore/ops/function/debug_func.py +1 -1
- mindspore/ops/function/linalg_func.py +21 -11
- mindspore/ops/function/math_func.py +3 -0
- mindspore/ops/function/nn_func.py +13 -11
- mindspore/ops/function/parameter_func.py +2 -0
- mindspore/ops/function/sparse_unary_func.py +2 -2
- mindspore/ops/function/vmap_func.py +1 -0
- mindspore/ops/operations/_embedding_cache_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +56 -1
- mindspore/ops/operations/_quant_ops.py +4 -4
- mindspore/ops/operations/_rl_inner_ops.py +1 -1
- mindspore/ops/operations/array_ops.py +15 -4
- mindspore/ops/operations/custom_ops.py +1 -1
- mindspore/ops/operations/debug_ops.py +1 -1
- mindspore/ops/operations/image_ops.py +3 -3
- mindspore/ops/operations/inner_ops.py +49 -0
- mindspore/ops/operations/math_ops.py +62 -0
- mindspore/ops/operations/nn_ops.py +7 -3
- mindspore/ops/operations/random_ops.py +2 -0
- mindspore/ops/operations/sparse_ops.py +4 -4
- mindspore/ops/silent_check.py +162 -0
- mindspore/parallel/__init__.py +3 -2
- mindspore/parallel/_auto_parallel_context.py +82 -3
- mindspore/parallel/_parallel_serialization.py +34 -2
- mindspore/parallel/_tensor.py +3 -1
- mindspore/parallel/_transformer/transformer.py +8 -8
- mindspore/parallel/checkpoint_transform.py +191 -45
- mindspore/pgodb140.dll +0 -0
- mindspore/pgort140.dll +0 -0
- mindspore/profiler/parser/ascend_cluster_generator.py +111 -0
- mindspore/profiler/parser/ascend_communicate_generator.py +315 -0
- mindspore/profiler/parser/ascend_flops_generator.py +8 -2
- mindspore/profiler/parser/ascend_fpbp_generator.py +8 -2
- mindspore/profiler/parser/ascend_hccl_generator.py +2 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +30 -6
- mindspore/profiler/parser/ascend_msprof_generator.py +16 -5
- mindspore/profiler/parser/ascend_op_generator.py +15 -7
- mindspore/profiler/parser/ascend_timeline_generator.py +5 -2
- mindspore/profiler/parser/base_timeline_generator.py +11 -3
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +2 -1
- mindspore/profiler/parser/framework_parser.py +8 -2
- mindspore/profiler/parser/memory_usage_parser.py +8 -2
- mindspore/profiler/parser/minddata_analyzer.py +8 -2
- mindspore/profiler/parser/minddata_parser.py +1 -1
- mindspore/profiler/parser/msadvisor_analyzer.py +4 -2
- mindspore/profiler/parser/msadvisor_parser.py +9 -3
- mindspore/profiler/profiling.py +97 -25
- mindspore/rewrite/api/node.py +1 -1
- mindspore/rewrite/api/symbol_tree.py +2 -2
- mindspore/tbbmalloc.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/callback/_checkpoint.py +8 -8
- mindspore/train/callback/_landscape.py +2 -3
- mindspore/train/callback/_summary_collector.py +6 -7
- mindspore/train/dataset_helper.py +6 -0
- mindspore/train/model.py +17 -5
- mindspore/train/serialization.py +6 -1
- mindspore/train/summary/_writer_pool.py +1 -1
- mindspore/train/summary/summary_record.py +5 -6
- mindspore/turbojpeg.dll +0 -0
- mindspore/vcmeta.dll +0 -0
- mindspore/vcruntime140.dll +0 -0
- mindspore/vcruntime140_1.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/METADATA +1 -1
- {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/RECORD +127 -123
- {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/WHEEL +0 -0
- {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/entry_points.txt +0 -0
- {mindspore-2.2.11.dist-info → mindspore-2.2.14.dist-info}/top_level.txt +0 -0
mindspore/parallel/_auto_parallel_context.py
CHANGED

@@ -65,6 +65,19 @@ class _ParallelOptimizerConfig:
     OPTIMIZER_WEIGHT_SHARD_SIZE = "optimizer_weight_shard_size"
 
 
+class _PipelineConfig:
+    """
+    The key of the Pipeline parallelism.
+    """
+    PIPELINE_INTERLEAVE = "pipeline_interleave"
+    PIPELINE_SCHEDULER = "pipeline_scheduler"
+
+
+class _PipelineScheduler:
+    PIPELINE_1F1B = "1f1b"
+    PIPELINE_GPIPE = "gpipe"
+
+
 class _AutoParallelContext:
     """
     _AutoParallelContext is the environment in which operations are executed
@@ -105,11 +118,11 @@ class _AutoParallelContext:
             device_num (int): The device number.
 
         Raises:
-            ValueError: If the device num is not
+            ValueError: If the device num is not a positive integer.
         """
         self.check_context_handle()
-        if device_num < 1
-            raise ValueError("The context configuration parameter 'device_num' must be
+        if device_num < 1:
+            raise ValueError("The context configuration parameter 'device_num' must be a positive integer, "
                              "but got the value of device_num : {}.".format(device_num))
         from mindspore.communication._comm_helper import _HCCL_TEST_AVAILABLE
         self._context_handle.set_hccl_test_avaible(_HCCL_TEST_AVAILABLE)
@@ -229,6 +242,16 @@ class _AutoParallelContext:
         self.check_context_handle()
         return self._context_handle.get_pipeline_stage_split_num()
 
+    def get_pipeline_interleave(self):
+        """Get pipeline interleave flag"""
+        self.check_context_handle()
+        return self._context_handle.get_pipeline_interleave()
+
+    def get_pipeline_scheduler(self):
+        """Get pipeline scheduler"""
+        self.check_context_handle()
+        return self._context_handle.get_pipeline_scheduler()
+
     def set_pipeline_segments(self, segments):
         """Set the segments of the pipeline"""
         if isinstance(segments, bool) or not isinstance(segments, int):
@@ -782,6 +805,57 @@ class _AutoParallelContext:
         self.check_context_handle()
         return self._context_handle.get_enable_fold_pipeline()
 
+    def set_pipeline_config(self, pipeline_config):
+        r"""
+        Set the configuration for pipeline parallelism. The configuration provides more detailed behavior control about
+        parallel training when pipeline parallelism is enabled.
+
+        Args:
+            pipeline_config (dict): The configuration for pipeline parallelism. It supports following keys:
+
+                - pipeline_interleave(bool): Setting true enable interleave scheduler for pipeline parallelism. This
+                  scheduler requires more memory but less bubble.
+                - pipeline_scheduler(string): There are two choices, "1f1b" and "gpipe". default is "1f1b"
+
+                  - 1f1b: It requires less memory and bubble ratio, for it run backward pass when corresponding forward pass
+                    finished.
+                  - gpipe: It requires more memory and bubble ratio, for it run backward pass after all forward pass
+                    finished.
+
+        Raises:
+            TypeError: If the type of `pipeline_config` is not `dict`.
+            ValueError: If the key in `pipeline_config` not in ["pipeline_interleave", "pipeline_scheduler"].
+            ValueError: If pipeline interleave is False, pipeline scheduler is not `1f1b`.
+        """
+        self.check_context_handle()
+
+        if not isinstance(pipeline_config, dict):
+            raise TypeError("For 'set_pipeline_config', the argument 'pipeine_config' "
+                            "must be dict, but got the type : {}.".format(type(pipeline_config)))
+
+        pp_interleave = _PipelineConfig.PIPELINE_INTERLEAVE
+        pp_scheduler = _PipelineConfig.PIPELINE_SCHEDULER
+
+        for config_name in pipeline_config:
+            unknown_config = []
+            if config_name not in [pp_interleave, pp_scheduler]:
+                unknown_config.append(config_name)
+
+            if unknown_config:
+                raise ValueError("Unknown config: {}".format(unknown_config))
+
+        Validator.check_bool(
+            pipeline_config[pp_interleave], pp_interleave, pp_interleave)
+        self._context_handle.set_pipeline_interleave(
+            pipeline_config[pp_interleave])
+
+        Validator.check_string(pipeline_config[pp_scheduler], [_PipelineScheduler.PIPELINE_1F1B,
+                                                               _PipelineScheduler.PIPELINE_GPIPE])
+        if not pipeline_config[pp_interleave] and pipeline_config[pp_scheduler] != _PipelineScheduler.PIPELINE_1F1B:
+            raise ValueError(f"When pipeline_interleave is False, {pp_scheduler} is not supported")
+
+        self._context_handle.set_pipeline_scheduler(pipeline_config[pp_scheduler])
+
     def get_enable_parallel_optimizer(self):
         """Get parallel optimizer flag."""
         self.check_context_handle()
@@ -1068,6 +1142,7 @@ class _AutoParallelContext:
         self.set_enable_all_gather_fusion(openstate)
         self.set_enable_reduce_scatter_fusion(openstate)
 
+
 def _set_ops_strategy_json_config(type="SAVE", path="", mode="all"):
     """
     Set strategy json configuration.
@@ -1091,6 +1166,7 @@ def _set_ops_strategy_json_config(type="SAVE", path="", mode="all"):
     else:
         raise KeyError("Type must be 'SAVE' or 'LOAD' and mode must be 'all' or 'principal'")
 
+
 _AUTO_PARALLEL_CONTEXT = None
 
 
@@ -1126,6 +1202,7 @@ _set_auto_parallel_context_func_map = {
     "dataset_strategy": auto_parallel_context().set_dataset_strategy,
     "enable_parallel_optimizer": auto_parallel_context().set_enable_parallel_optimizer,
     "parallel_optimizer_config": auto_parallel_context().set_parallel_optimizer_config,
+    "pipeline_config": auto_parallel_context().set_pipeline_config,
     "grad_accumulation_step": auto_parallel_context().set_grad_accumulation_step,
     "all_reduce_fusion_config": auto_parallel_context().set_all_reduce_fusion_split_indices,
     "communi_parallel_mode": auto_parallel_context().set_communi_parallel_mode,
@@ -1143,6 +1220,8 @@ _get_auto_parallel_context_func_map = {
     "gradient_fp32_sync": auto_parallel_context().get_gradient_fp32_sync,
     "loss_repeated_mean": auto_parallel_context().get_loss_repeated_mean,
     "pipeline_stages": auto_parallel_context().get_pipeline_stages,
+    "pipeline_interleave": auto_parallel_context().get_pipeline_interleave,
+    "pipeline_scheduler": auto_parallel_context().get_pipeline_scheduler,
     "parallel_mode": auto_parallel_context().get_parallel_mode,
     "search_mode": auto_parallel_context().get_strategy_search_mode,
     "auto_parallel_search_mode": auto_parallel_context().get_auto_parallel_search_mode,
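
Because the diff registers "pipeline_config" in _set_auto_parallel_context_func_map, the new keys should be reachable through mindspore.set_auto_parallel_context. A minimal sketch, assuming a semi-auto-parallel pipeline run (the stage count and the surrounding training script are hypothetical):

    import mindspore as ms

    # Hedged sketch: set_pipeline_config above reads both keys, so both are supplied.
    # Per its validation, "gpipe" is only accepted when pipeline_interleave is True.
    ms.set_auto_parallel_context(
        parallel_mode="semi_auto_parallel",   # hypothetical surrounding setup
        pipeline_stages=4,                    # hypothetical stage count
        pipeline_config={
            "pipeline_interleave": True,      # interleaved scheduling: more memory, smaller bubble
            "pipeline_scheduler": "1f1b",     # documented default; "gpipe" is the alternative
        },
    )
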

mindspore/parallel/_parallel_serialization.py
CHANGED

@@ -243,6 +243,33 @@ def _extract_pipeline_stage_num(strategy_file):
     return pipeline_stage_num
 
 
+def _extract_src_dst_layout_map_by_src(src_strategy_file=None, dst_strategy_file=None):
+    """Extract strategy list by src strategy"""
+    src_layout_map = _extract_layout_map(src_strategy_file)
+    dst_layout_map = _extract_layout_map(dst_strategy_file)
+    if dst_layout_map is None:
+        return src_layout_map, dst_layout_map
+    for param_name in list(dst_layout_map.keys()):
+        if param_name in src_layout_map.keys():
+            continue
+        dst_layout_map.pop(param_name)
+    stage_id = 0
+    if src_strategy_file[-5:] == ".json":
+        with open(src_strategy_file, 'r') as f:
+            json_content = json.load(f)
+            strategy_items = json_content.get("parallel_strategy_item")
+            if not strategy_items:
+                raise ValueError("The strategy file {} if empty.".format(src_strategy_file))
+            stage_id = strategy_items.get(list(strategy_items.keys())[0]).get('stage')
+    else:
+        src_parallel_strategy_map = _load_protobuf_strategy(src_strategy_file)
+        strategy_items = src_parallel_strategy_map.parallel_strategy_item
+        if not strategy_items:
+            raise ValueError("The strategy file {} if empty.".format(src_strategy_file))
+        stage_id = strategy_items[0].parallel_strategys.stage
+    return src_layout_map, dst_layout_map, stage_id
+
+
 def _extract_src_dst_layout_map(rank_id, src_strategy_file=None, dst_strategy_file=None):
     """Extract strategy list"""
     src_layout_map = _extract_layout_map(src_strategy_file)
@@ -341,6 +368,7 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
     Transform model parallel dimension for distributed checkpoint files.
     """
     transform_param_dict = {}
+    device_num = -1
     for param_name, _ in param_total_dict.items():
         tensor_shape = list(param_total_dict[param_name].values())[0].shape
         from_dev_matrix = [1]
@@ -394,14 +422,18 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
         to_info_tuple = (to_opt_shard_size, to_dev_matrix_origin, to_tensor_map_origin, origin_tensor_shape)
         _insert_opt_shard_reshape(param_rank_map, from_info_tuple, to_info_tuple)
         transform_operator_stack = _generate_transform_operator_stack(param_rank_map, rank_id)
-
-
+        param_total_dict_copy = param_total_dict[param_name].copy()
+        _apply_tensor_transform_operators(transform_operator_stack, param_total_dict_copy, device_num)
+        transform_tensor = ms.Tensor(param_total_dict_copy[rank_id % device_num])
         requires_grad = param_attr_dict[param_name][rank_id % device_num][0]
         layerwise_parallel = param_attr_dict[param_name][rank_id % device_num][1]
         transform_para = ms.Parameter(transform_tensor, param_name, requires_grad, layerwise_parallel)
         if param_type_dict[param_name][rank_id % device_num] == "BFloat16":
             transform_para.set_dtype(ms.bfloat16)
         transform_param_dict[param_name] = transform_para
+    if device_num < 0:
+        raise ValueError("None of the parameters in checkpoint file are in either src strategy or "
+                         "dst strategy. Please check correctness of strategy files.")
 
     # Handle those parameter like learning_rate, global_step which not in strategy_file.
     for param_name, _ in param_total_dict.items():
mindspore/parallel/_tensor.py
CHANGED

@@ -223,7 +223,9 @@ def _load_tensor(tensor, dev_mat, tensor_map, rank_id=-1):
     tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map)
     tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank)
     if tensor.dtype == mstype.bfloat16:
-
+        from mindspore.ops.operations import Cast
+        cpu_cast = Cast().set_device("CPU")
+        tensor = cpu_cast(tensor, mstype.float32)
     np_tensor = tensor.asnumpy()
     np_tensor_list = _chunk_tensor_by_strategy(np_tensor, tensor_strategy)
     np_tensor_slice = np_tensor_list[int(tensor_slice_index)]
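
The _load_tensor change above appears to exist because NumPy has no native bfloat16 type: the tensor is first cast to float32 by a Cast primitive pinned to the CPU before asnumpy() is called. A small standalone sketch of the same idea (not the library's internal code path; it only mirrors the calls visible in the diff):

    from mindspore.common import dtype as mstype
    from mindspore.ops.operations import Cast

    def _to_numpy_compatible(tensor):
        """Cast bfloat16 tensors to float32 on the CPU so asnumpy() can be used."""
        if tensor.dtype == mstype.bfloat16:
            cpu_cast = Cast().set_device("CPU")   # same pattern as the diff above
            tensor = cpu_cast(tensor, mstype.float32)
        return tensor.asnumpy()
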

mindspore/parallel/_transformer/transformer.py
CHANGED

@@ -805,14 +805,14 @@ class MultiHeadAttention(Cell):
         - **attention_mask** (Tensor) - If the use_past is False or is_first_iteration=True, the attention mask
           matrix should ba (batch_size, src_seq_length, tgt_seq_length), or None. None means there will be no mask
           in softmax computation. Otherwise, the mask must be (batch_size, 1, tgt_seq_length)
-        - **key_past** (Tensor) -
+        - **key_past** (Tensor) - float16 tensor with shape (batch_size, num_heads, size_per_head, tgt_seq_length).
           The past calculated key vector. Used for incremental prediction when the use_past is True.
           Default None.
-        - **value_past** (Tensor) -
+        - **value_past** (Tensor) - float16 tensor with shape
           (batch_size, num_heads, tgt_seq_length, size_per_head).
           The past calculated value vector. Used for incremental prediction when the use_past is True.
           Default None.
-        - **batch_valid_length** (Tensor) -
+        - **batch_valid_length** (Tensor) - int32 tensor with shape (batch_size,) the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.
 
     Outputs:
@@ -1412,7 +1412,7 @@ class TransformerEncoderLayer(Cell):
           be no mask in softmax computation. Otherwise, should be [batch_size, 1, hidden_size]
         - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
           past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) -
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.
 
     Outputs:
@@ -1824,7 +1824,7 @@ class TransformerDecoderLayer(Cell):
          means there will be no mask in softmax computation in cross attention. Default None.
         - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
           past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) -
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.
 
     Outputs:
@@ -2333,7 +2333,7 @@ class TransformerEncoder(Cell):
          be no mask in softmax computation. Otherwise, should be [batch_size, 1, hidden_size]
         - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
           past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) -
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.
 
     Outputs:
@@ -2589,7 +2589,7 @@ class TransformerDecoder(Cell):
          means there will be no mask in softmax computation in cross attention. Default None.
         - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
           past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) -
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.
 
     Outputs:
@@ -2842,7 +2842,7 @@ class Transformer(Cell):
          seq_length, hidden_size], this should be none if the decoder layer is 0 or the user wants no mask.
         - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and
           past value parameter used in the incremental prediction. Only valid when use_past is True. Default True.
-        - **batch_valid_length** (Tensor) -
+        - **batch_valid_length** (Tensor) - int32 tensor with shape [batch_size] the past calculated the index.
           Used for incremental prediction when the use_past is True. Default None.
 
     Outputs:
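
The docstring additions above pin down the dtypes and shapes expected for the incremental-prediction (use_past=True) inputs. A sketch of allocating such buffers with the documented shapes; the dimension values are made up for illustration and no network is constructed:

    import numpy as np
    from mindspore import Tensor

    batch_size, num_heads, size_per_head, tgt_seq_length = 2, 8, 64, 16  # hypothetical sizes

    # Shapes follow the docstrings added in this diff.
    key_past = Tensor(np.zeros((batch_size, num_heads, size_per_head, tgt_seq_length), np.float16))
    value_past = Tensor(np.zeros((batch_size, num_heads, tgt_seq_length, size_per_head), np.float16))
    batch_valid_length = Tensor(np.zeros((batch_size,), np.int32))
    init_reset = Tensor([True])  # bool tensor with shape [1], per the existing docstring
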

mindspore/parallel/checkpoint_transform.py
CHANGED

@@ -25,11 +25,11 @@ from mindspore.common import dtype as mstype
 from mindspore.parallel._parallel_serialization import _rank_list_for_transform_parallel_checkpoint, \
     _transform_parallel_checkpoint, _get_device_num_from_strategy, _make_dir, \
     _extract_layout_map, _extract_src_dst_layout_map, _parameter_not_in_local_stage, _extract_pipeline_stage_num, \
-    _merge_protobuf_strategy, _merge_json_strategy
+    _merge_protobuf_strategy, _merge_json_strategy, _extract_src_dst_layout_map_by_src
 
 
 __all__ = ["merge_pipeline_strategys", "rank_list_for_transform", "transform_checkpoint_by_rank",
-           "transform_checkpoints"]
+           "transform_checkpoints", "load_segmented_checkpoints"]
 
 
 def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
@@ -71,7 +71,6 @@ def merge_pipeline_strategys(src_strategy_dirs, dst_strategy_file):
     _merge_json_strategy(src_strategy_files_json, dst_strategy_file)
 
 
-
 def rank_list_for_transform(rank_id, src_strategy_file=None, dst_strategy_file=None):
     """
     List of original distributed checkpoint rank index for obtaining the target checkpoint of a rank_id
@@ -222,48 +221,63 @@ def transform_checkpoint_by_rank(rank_id, checkpoint_files_map, save_checkpoint_
     ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
 
 
-def
-
-"""
- ... (39 further removed lines are blank/truncated in the original diff view)
+def _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file,
+                                   dst_strategy_file=None):
+    """Transform checkpoint for stage in src_strategy_file"""
+    param_total_dict = defaultdict(dict)
+    param_attr_dict = defaultdict(dict)
+    param_type_dict = defaultdict(dict)
+    src_strategy_list, dst_strategy_list, stage_id = _extract_src_dst_layout_map_by_src(src_strategy_file, \
+                                                                                        dst_strategy_file)
+    src_stage_device_num = np.prod(src_strategy_list.get(list(src_strategy_list.keys())[0])[0]) if src_strategy_list \
+        is not None else 1
+    dst_stage_device_num = np.prod(dst_strategy_list.get(list(dst_strategy_list.keys())[0])[0]) if dst_strategy_list \
+        is not None else 1
+    origin_dst_strategy_list = _extract_layout_map(dst_strategy_file)
+    origin_src_strategy_list = _extract_layout_map(src_strategy_file)
+    checkpoint_files_map = {}
+    src_rank_id_start = stage_id * src_stage_device_num
+    for local_rank in range(src_stage_device_num):
+        rank_id = src_rank_id_start + local_rank
+        checkpoint_file_name = os.path.join(src_checkpoints_dir, "rank_{}".format(rank_id), "*.ckpt")
+        rank_ckpts = glob.glob(checkpoint_file_name)
+        rank_ckpts.sort()
+        for checkpoint_file in rank_ckpts:
+            if not os.path.isfile(checkpoint_file):
+                ms.log.warning("{} is not a checkpoint file.".format(checkpoint_file))
+                continue
+            checkpoint_files_map[rank_id] = checkpoint_file
+    for rank, local_file in checkpoint_files_map.items():
+        if not os.path.exists(local_file):
+            raise ValueError("Checkpoint file {} in rank {} not exits: ".format(local_file, rank))
+    for rank, file_name in checkpoint_files_map.items():
+        ckpt_dict = ms.load_checkpoint(file_name)
+        for param_name, param in ckpt_dict.items():
+            # cut the parameter not in the pipeline stage.
+            if _parameter_not_in_local_stage(param_name, origin_src_strategy_list, src_strategy_list) \
+                    and _parameter_not_in_local_stage(param_name, origin_dst_strategy_list, dst_strategy_list):
+                continue
+            src_rank = rank % src_stage_device_num
+            param_type_dict[param_name][src_rank] = str(param.data.dtype)
+            if param.data.dtype == mstype.bfloat16:
+                param.set_dtype(mstype.float32)
+            param_total_dict[param_name][src_rank] = param.data.asnumpy()
+            param_attr_dict[param_name][src_rank] = (param.requires_grad, param.layerwise_parallel)
+    for local_rank_id in range(dst_stage_device_num):
+        transform_param_list = _transform_parallel_checkpoint(local_rank_id, param_total_dict,
+                                                              param_attr_dict, src_strategy_list, dst_strategy_list,
+                                                              param_type_dict)
+        save_checkpoint_file = "{}{}_part{}.ckpt".format(ckpt_prefix, local_rank_id, stage_id)
+        save_checkpoint_file_dir = os.path.join(dst_checkpoints_dir, "rank_{}".format(local_rank_id))
+        if not os.path.exists(save_checkpoint_file_dir):
+            _make_dir(save_checkpoint_file_dir, "path")
+        save_checkpoint_file_name = os.path.join(save_checkpoint_file_dir, save_checkpoint_file)
+        ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
+
+
+def _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+                           dst_strategy_file=None):
+    """Transform checkpoints for all stages in src_strategy_file"""
     checkpoints_rank_dir_list = os.path.join(src_checkpoints_dir, "rank_[0-9]*")
     all_checkpoint_files_map = {}
     for checkpoint_dir in glob.glob(checkpoints_rank_dir_list):
@@ -336,3 +350,135 @@ def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
             ms.save_checkpoint(transform_param_list, save_checkpoint_file_name)
             del param_total_dict_copy
     del param_total_dict
+
+
+def transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix, src_strategy_file=None,
+                          dst_strategy_file=None):
+    """
+    Transform distributed checkpoint from source sharding strategy to destination sharding strategy for a rank.
+    For more details about converting distributed Checkpoint, please refer to
+    `Model Transformation <https://www.mindspore.cn/tutorials/experts/en/r2.2/parallel/model_transformation.html>`_.
+
+    Note:
+        The `src_checkpoints_dir` directory structure should be organized like "src_checkpoints_dir/rank_0/a.ckpt", the
+        rank number should be set to a subdirectory and the checkpoint file is stored in this subdirectory. If multiple
+        files exist in a rank directory, the last file in the lexicgraphic order would be selected.
+
+    Args:
+        src_checkpoints_dir (str): The source checkpoints directory.
+        dst_checkpoints_dir (str): The destination checkpoints directory to save the converted checkpoints.
+        ckpt_prefix (str): The destination checkpoint name prefix.
+        src_strategy_file (str): Name of source sharding strategy file which saved by
+            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+            when the 'src_strategy_file' is None, it means that the source sharding strategy is
+            without any sharing for each parameter. Default:None.
+        dst_strategy_file (str): Name of destination sharding strategy file which saved by
+            'mindspore.set_auto_parallel_context(strategy_ckpt_save_file)'.
+            when the 'dst_strategy_file' is None, it means that the destination sharding strategy
+            is without any sharing for each parameter. Default:None.
+
+    Raises:
+        ValueError: `src_strategy_file` or `dst_strategy_file` is incorrect.
+        NotADirectoryError: `src_checkpoints_dir` or `dst_checkpoints_dir` is not a directory.
+        ValueError: The checkpoint file is missing in `src_checkpoints_dir`.
+        TypeError: `src_strategy_file` or `dst_strategy_file` is not a string.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> ms.transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, "dst_checkpoint",
+        ...                          "./src_strategy.ckpt", "./dst_strategy.ckpt")
+
+    """
+    if not os.path.isdir(src_checkpoints_dir):
+        raise NotADirectoryError("src_checkpoints_dir {} is not a directory.".format(src_checkpoints_dir))
+    _make_dir(dst_checkpoints_dir, "path")
+    if not isinstance(ckpt_prefix, str):
+        raise TypeError("The ckpt_prefix should be a str.")
+    if src_strategy_file and os.path.dirname(src_strategy_file) and not os.path.exists(
+            os.path.dirname(src_strategy_file)):
+        raise ValueError("The director of src_strategy_file: {} is not exists.".
+                         format(os.path.dirname(src_strategy_file)))
+    if dst_strategy_file and os.path.dirname(dst_strategy_file) and not os.path.exists(
+            os.path.dirname(dst_strategy_file)):
+        raise ValueError("The director of dst_strategy_file: {} is not exists.".
+                         format(os.path.dirname(dst_strategy_file)))
+    src_layout_map = _extract_layout_map(src_strategy_file)
+    dst_layout_map = _extract_layout_map(dst_strategy_file)
+    pipeline_stage_num = _extract_pipeline_stage_num(src_strategy_file)
+    if src_layout_map:
+        src_param_keys = {param_name for param_name in src_layout_map if not param_name.startswith("accu_grads")}
+    if dst_layout_map:
+        dst_param_keys = {param_name for param_name in dst_layout_map if not param_name.startswith("accu_grads")}
+    if src_layout_map and dst_layout_map and pipeline_stage_num == 1 \
+            and src_param_keys.issubset(dst_param_keys) and len(src_param_keys) < len(dst_param_keys):
+        dst_stage_num = _extract_pipeline_stage_num(dst_strategy_file)
+        if dst_stage_num > 1:
+            raise NotImplementedError("When using unmerged src strategy, dst strategy doesn't \
+            support strategy with pipeline parallel.")
+        _transform_checkpoint_by_stage(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+                                       src_strategy_file, dst_strategy_file)
+    else:
+        _transform_checkpoints(src_checkpoints_dir, dst_checkpoints_dir, ckpt_prefix,
+                               src_strategy_file, dst_strategy_file)
+
+
+def load_segmented_checkpoints(ckpt_file_dir, net=None, strict_load=False, filter_prefix=None,
+                               dec_key=None, dec_mode="AES-GCM", specify_prefix=None, choice_func=None):
+    """
+    Load checkpoint info from a specified file. If the specified ckpt_file_dir path contains multiple
+    checkpoint files, all checkpoint files will be loaded one by one and the combined dictionary will be return.
+
+    Note:
+        - `specify_prefix` and `filter_prefix` do not affect each other.
+        - If none of the parameters are loaded from checkpoint file, it will throw ValueError.
+        - `specify_prefix` and `filter_prefix` are in the process of being deprecated,
+          `choice_func` is recommended instead.
+          And using either of those two args will override `choice_func` at the same time.
+
+    Args:
+        ckpt_file_dir (str): Checkpoint file directory.
+        net (Cell): The network where the parameters will be loaded. Default: ``None`` .
+        strict_load (bool): Whether to strict load the parameter into net. If ``False`` , it will load parameter
+            into net when parameter name's suffix in checkpoint file is the same as the
+            parameter in the network. When the types are inconsistent perform type conversion
+            on the parameters of the same type, such as float32 to float16. Default: ``False`` .
+        filter_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+            filter_prefix will not be loaded. Default: ``None`` .
+        dec_key (Union[None, bytes]): Byte type key used for decryption. If the value is ``None`` , the decryption
+            is not required. Default: ``None`` .
+        dec_mode (str): This parameter is valid only when dec_key is not set to ``None`` . Specifies the decryption
+            mode, currently supports ``"AES-GCM"`` and ``"AES-CBC"`` and ``"SM4-CBC"`` .
+            Default: ``"AES-GCM"`` .
+        specify_prefix (Union[str, list[str], tuple[str]]): Deprecated(see `choice_func`). Parameters starting with the
+            specify_prefix will be loaded. Default: ``None`` .
+        choice_func (Union[None, function]) : Input value of the function is a Parameter name of type string,
+            and the return value is a bool. If returns ``True`` , the Parameter
+            that matches the custom condition will be loaded. If returns ``False`` , the Parameter that
+            matches the custom condition will be removed. Default: ``None`` .
+
+    Returns:
+        Dict, key is parameter name, value is a Parameter or string. When the `append_dict` parameter of
+        :func:`mindspore.save_checkpoint` and the `append_info` parameter of :class:`mindspore.train.CheckpointConfig`
+        are used to save the checkpoint, `append_dict` and `append_info` are dict types, and their value are string,
+        then the return value obtained by loading checkpoint is string, and in other cases the return value is
+        Parameter.
+
+    Raises:
+        TypeError: Input ckpt_file_dir is not a string.
+        ValueError: Checkpoint file directory doesn't exist. Or it's not a directory
+        ValueError: Checkpoint file's format is incorrect.
+        ValueError: Parameter's dict is None after load checkpoint file.
+        TypeError: The type of `specify_prefix` or `filter_prefix` is incorrect.
+    """
+    if not isinstance(ckpt_file_dir, str):
+        raise TypeError("The ckpt_file_dir should be a str.")
+    if not os.path.isdir(ckpt_file_dir):
+        raise ValueError("The dst_strategy_file: {} doesn't exists. Or it's not a directory.".
+                         format(os.path.dirname(ckpt_file_dir)))
+    checkpoint_file_name = os.path.join(ckpt_file_dir, "*.ckpt")
+    rank_ckpts = glob.glob(checkpoint_file_name)
+    parameter_dict = {}
+    for checkpoint_file in rank_ckpts:
+        parameter_dict.update(ms.load_checkpoint(checkpoint_file, net, strict_load, filter_prefix, dec_key,
+                                                 dec_mode, specify_prefix, choice_func))
+    return parameter_dict
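
Going by the docstring above, load_segmented_checkpoints globs every *.ckpt file under a directory and merges the per-file dictionaries returned by ms.load_checkpoint. A hedged usage sketch; the directory path is hypothetical, and the deep import path simply points at the module this diff modifies (whether the name is also re-exported from mindspore.parallel depends on the accompanying __init__.py change):

    import mindspore as ms
    from mindspore.parallel.checkpoint_transform import load_segmented_checkpoints

    ckpt_dir = "./transformed_checkpoints/rank_0"   # hypothetical directory of segmented *.ckpt files

    # Merge every checkpoint segment in the directory into one parameter dict,
    # keeping only parameters whose names start with "backbone".
    param_dict = load_segmented_checkpoints(
        ckpt_dir,
        choice_func=lambda name: name.startswith("backbone"),
    )

    # The merged dict can then be loaded into a network as usual:
    # ms.load_param_into_net(net, param_dict)
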
mindspore/pgodb140.dll
CHANGED
Binary file

mindspore/pgort140.dll
CHANGED
Binary file