mindspore 2.3.0__cp39-cp39-win_amd64.whl → 2.4.1__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore might be problematic according to the registry's automated checks.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +3 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +50 -9
- mindspore/_extends/parse/compile_config.py +41 -0
- mindspore/_extends/parse/parser.py +9 -7
- mindspore/_extends/parse/standard_method.py +52 -14
- mindspore/_extends/pijit/pijit_func_white_list.py +350 -24
- mindspore/amp.py +24 -10
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/common/__init__.py +6 -4
- mindspore/common/_pijit_context.py +190 -0
- mindspore/common/_register_for_tensor.py +2 -1
- mindspore/common/_tensor_overload.py +139 -0
- mindspore/common/api.py +102 -87
- mindspore/common/dump.py +5 -6
- mindspore/common/generator.py +1 -7
- mindspore/common/hook_handle.py +14 -26
- mindspore/common/initializer.py +51 -15
- mindspore/common/mindir_util.py +2 -2
- mindspore/common/parameter.py +62 -15
- mindspore/common/recompute.py +39 -9
- mindspore/common/sparse_tensor.py +7 -3
- mindspore/common/tensor.py +183 -37
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +38 -3
- mindspore/communication/comm_func.py +315 -60
- mindspore/communication/management.py +14 -14
- mindspore/context.py +132 -22
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/__init__.py +1 -1
- mindspore/dataset/core/config.py +7 -0
- mindspore/dataset/core/validator_helpers.py +7 -0
- mindspore/dataset/engine/cache_client.py +1 -1
- mindspore/dataset/engine/datasets.py +72 -44
- mindspore/dataset/engine/datasets_audio.py +7 -7
- mindspore/dataset/engine/datasets_standard_format.py +53 -3
- mindspore/dataset/engine/datasets_text.py +20 -20
- mindspore/dataset/engine/datasets_user_defined.py +174 -104
- mindspore/dataset/engine/datasets_vision.py +33 -33
- mindspore/dataset/engine/iterators.py +29 -0
- mindspore/dataset/engine/obs/util.py +7 -0
- mindspore/dataset/engine/queue.py +114 -60
- mindspore/dataset/engine/serializer_deserializer.py +2 -2
- mindspore/dataset/engine/validators.py +34 -14
- mindspore/dataset/text/__init__.py +1 -4
- mindspore/dataset/transforms/__init__.py +0 -3
- mindspore/dataset/utils/line_reader.py +2 -0
- mindspore/dataset/vision/__init__.py +1 -4
- mindspore/dataset/vision/utils.py +1 -1
- mindspore/dataset/vision/validators.py +2 -1
- mindspore/dnnl.dll +0 -0
- mindspore/{nn/extend → experimental/es}/__init__.py +4 -11
- mindspore/experimental/es/embedding_service.py +883 -0
- mindspore/{nn/layer → experimental/es}/embedding_service_layer.py +218 -30
- mindspore/experimental/llm_boost/__init__.py +21 -0
- mindspore/{nn/extend/layer → experimental/llm_boost/atb}/__init__.py +4 -8
- mindspore/experimental/llm_boost/atb/boost_base.py +211 -0
- mindspore/experimental/llm_boost/atb/llama_boost.py +115 -0
- mindspore/experimental/llm_boost/atb/qwen_boost.py +101 -0
- mindspore/experimental/llm_boost/register.py +129 -0
- mindspore/experimental/llm_boost/utils.py +31 -0
- mindspore/experimental/optim/adamw.py +85 -0
- mindspore/experimental/optim/optimizer.py +3 -0
- mindspore/hal/__init__.py +3 -3
- mindspore/hal/contiguous_tensors_handle.py +175 -0
- mindspore/hal/stream.py +18 -0
- mindspore/include/api/model_group.h +13 -1
- mindspore/include/api/types.h +10 -10
- mindspore/include/dataset/config.h +2 -2
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/dataset/execute.h +2 -2
- mindspore/include/dataset/vision.h +4 -0
- mindspore/jpeg62.dll +0 -0
- mindspore/log.py +1 -1
- mindspore/mindrecord/filewriter.py +68 -51
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mint/__init__.py +983 -46
- mindspore/mint/distributed/__init__.py +31 -0
- mindspore/mint/distributed/distributed.py +254 -0
- mindspore/mint/nn/__init__.py +268 -23
- mindspore/mint/nn/functional.py +125 -19
- mindspore/mint/nn/layer/__init__.py +39 -0
- mindspore/mint/nn/layer/activation.py +133 -0
- mindspore/mint/nn/layer/normalization.py +477 -0
- mindspore/mint/nn/layer/pooling.py +110 -0
- mindspore/mint/optim/adamw.py +26 -13
- mindspore/mint/special/__init__.py +63 -0
- mindspore/multiprocessing/__init__.py +2 -1
- mindspore/nn/__init__.py +0 -1
- mindspore/nn/cell.py +276 -96
- mindspore/nn/layer/activation.py +211 -44
- mindspore/nn/layer/basic.py +137 -10
- mindspore/nn/layer/embedding.py +137 -2
- mindspore/nn/layer/normalization.py +101 -5
- mindspore/nn/layer/padding.py +34 -48
- mindspore/nn/layer/pooling.py +161 -7
- mindspore/nn/layer/transformer.py +3 -3
- mindspore/nn/loss/__init__.py +2 -2
- mindspore/nn/loss/loss.py +84 -6
- mindspore/nn/optim/__init__.py +2 -1
- mindspore/nn/optim/adadelta.py +1 -1
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lamb.py +1 -1
- mindspore/nn/optim/tft_wrapper.py +124 -0
- mindspore/nn/wrap/cell_wrapper.py +12 -23
- mindspore/nn/wrap/grad_reducer.py +5 -5
- mindspore/nn/wrap/loss_scale.py +17 -3
- mindspore/numpy/__init__.py +1 -1
- mindspore/numpy/array_creations.py +65 -68
- mindspore/numpy/array_ops.py +64 -60
- mindspore/numpy/fft.py +610 -75
- mindspore/numpy/logic_ops.py +11 -10
- mindspore/numpy/math_ops.py +85 -84
- mindspore/numpy/utils_const.py +4 -4
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/__init__.py +6 -4
- mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
- mindspore/ops/_grad_experimental/grad_comm_ops.py +67 -4
- mindspore/ops/_grad_experimental/grad_math_ops.py +0 -22
- mindspore/ops/_vmap/vmap_array_ops.py +2 -4
- mindspore/ops/_vmap/vmap_math_ops.py +17 -1
- mindspore/ops/_vmap/vmap_nn_ops.py +43 -2
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +91 -7
- mindspore/ops/auto_generate/gen_arg_dtype_cast.py +2 -0
- mindspore/ops/auto_generate/gen_extend_func.py +767 -13
- mindspore/ops/auto_generate/gen_ops_def.py +2452 -364
- mindspore/ops/auto_generate/gen_ops_prim.py +5442 -1756
- mindspore/ops/auto_generate/pyboost_inner_prim.py +176 -56
- mindspore/ops/composite/base.py +85 -48
- mindspore/ops/composite/multitype_ops/_compile_utils.py +1 -0
- mindspore/ops/composite/multitype_ops/not_in_impl.py +2 -2
- mindspore/ops/function/__init__.py +22 -0
- mindspore/ops/function/array_func.py +492 -153
- mindspore/ops/function/debug_func.py +113 -1
- mindspore/ops/function/fft_func.py +15 -2
- mindspore/ops/function/grad/grad_func.py +3 -2
- mindspore/ops/function/math_func.py +564 -207
- mindspore/ops/function/nn_func.py +817 -383
- mindspore/ops/function/other_func.py +3 -2
- mindspore/ops/function/random_func.py +402 -12
- mindspore/ops/function/reshard_func.py +13 -11
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/function/vmap_func.py +3 -2
- mindspore/ops/functional.py +24 -14
- mindspore/ops/op_info_register.py +3 -3
- mindspore/ops/operations/__init__.py +7 -2
- mindspore/ops/operations/_grad_ops.py +2 -76
- mindspore/ops/operations/_infer_ops.py +1 -1
- mindspore/ops/operations/_inner_ops.py +71 -94
- mindspore/ops/operations/array_ops.py +14 -146
- mindspore/ops/operations/comm_ops.py +63 -53
- mindspore/ops/operations/custom_ops.py +83 -19
- mindspore/ops/operations/debug_ops.py +42 -10
- mindspore/ops/operations/manually_defined/_inner.py +12 -0
- mindspore/ops/operations/manually_defined/ops_def.py +273 -20
- mindspore/ops/operations/math_ops.py +12 -223
- mindspore/ops/operations/nn_ops.py +20 -114
- mindspore/ops/operations/other_ops.py +7 -4
- mindspore/ops/operations/random_ops.py +46 -1
- mindspore/ops/primitive.py +18 -6
- mindspore/ops_generate/arg_dtype_cast.py +2 -0
- mindspore/ops_generate/gen_aclnn_implement.py +11 -11
- mindspore/ops_generate/gen_constants.py +36 -0
- mindspore/ops_generate/gen_ops.py +67 -52
- mindspore/ops_generate/gen_ops_inner_prim.py +1 -1
- mindspore/ops_generate/gen_pyboost_func.py +131 -47
- mindspore/ops_generate/op_proto.py +10 -3
- mindspore/ops_generate/pyboost_utils.py +14 -1
- mindspore/ops_generate/template.py +43 -21
- mindspore/parallel/__init__.py +3 -1
- mindspore/parallel/_auto_parallel_context.py +31 -9
- mindspore/parallel/_cell_wrapper.py +85 -0
- mindspore/parallel/_parallel_serialization.py +47 -19
- mindspore/parallel/_tensor.py +127 -13
- mindspore/parallel/_utils.py +53 -22
- mindspore/parallel/algo_parameter_config.py +5 -5
- mindspore/parallel/checkpoint_transform.py +46 -39
- mindspore/parallel/cluster/process_entity/__init__.py +1 -1
- mindspore/parallel/cluster/process_entity/_api.py +31 -23
- mindspore/parallel/cluster/process_entity/_utils.py +2 -27
- mindspore/parallel/parameter_broadcast.py +3 -4
- mindspore/parallel/shard.py +162 -31
- mindspore/parallel/transform_safetensors.py +1146 -0
- mindspore/profiler/__init__.py +2 -1
- mindspore/profiler/common/constant.py +29 -0
- mindspore/profiler/common/registry.py +47 -0
- mindspore/profiler/common/util.py +28 -0
- mindspore/profiler/dynamic_profiler.py +694 -0
- mindspore/profiler/envprofiling.py +17 -19
- mindspore/profiler/parser/ascend_analysis/constant.py +18 -0
- mindspore/profiler/parser/ascend_analysis/file_manager.py +25 -4
- mindspore/profiler/parser/ascend_analysis/function_event.py +43 -19
- mindspore/profiler/parser/ascend_analysis/fwk_cann_parser.py +31 -26
- mindspore/profiler/parser/ascend_analysis/fwk_file_parser.py +56 -10
- mindspore/profiler/parser/ascend_analysis/msprof_timeline_parser.py +55 -8
- mindspore/profiler/parser/ascend_analysis/path_manager.py +313 -0
- mindspore/profiler/parser/ascend_analysis/profiler_info_parser.py +27 -20
- mindspore/profiler/parser/ascend_analysis/trace_event_manager.py +9 -2
- mindspore/profiler/parser/ascend_msprof_exporter.py +5 -4
- mindspore/profiler/parser/ascend_timeline_generator.py +27 -25
- mindspore/profiler/parser/base_timeline_generator.py +19 -25
- mindspore/profiler/parser/cpu_gpu_timeline_generator.py +25 -12
- mindspore/profiler/parser/framework_parser.py +1 -391
- mindspore/profiler/parser/gpu_analysis/__init__.py +14 -0
- mindspore/profiler/parser/gpu_analysis/function_event.py +44 -0
- mindspore/profiler/parser/gpu_analysis/fwk_file_parser.py +89 -0
- mindspore/profiler/parser/gpu_analysis/profiler_info_parser.py +72 -0
- mindspore/profiler/parser/memory_usage_parser.py +0 -154
- mindspore/profiler/parser/profiler_info.py +78 -6
- mindspore/profiler/profiler.py +153 -0
- mindspore/profiler/profiling.py +285 -413
- mindspore/rewrite/__init__.py +1 -2
- mindspore/rewrite/common/namespace.py +4 -4
- mindspore/rewrite/symbol_tree/symbol_tree.py +3 -3
- mindspore/run_check/_check_version.py +39 -104
- mindspore/safeguard/rewrite_obfuscation.py +591 -247
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/__init__.py +4 -3
- mindspore/train/_utils.py +105 -19
- mindspore/train/amp.py +171 -53
- mindspore/train/callback/__init__.py +2 -2
- mindspore/train/callback/_callback.py +4 -4
- mindspore/train/callback/_checkpoint.py +97 -31
- mindspore/train/callback/_cluster_monitor.py +1 -1
- mindspore/train/callback/_flops_collector.py +1 -0
- mindspore/train/callback/_loss_monitor.py +3 -3
- mindspore/train/callback/_on_request_exit.py +145 -31
- mindspore/train/callback/_summary_collector.py +5 -5
- mindspore/train/callback/_tft_register.py +375 -0
- mindspore/train/dataset_helper.py +15 -3
- mindspore/train/metrics/metric.py +3 -3
- mindspore/train/metrics/roc.py +4 -4
- mindspore/train/mind_ir_pb2.py +44 -39
- mindspore/train/model.py +154 -58
- mindspore/train/serialization.py +342 -128
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/__init__.py +21 -0
- mindspore/utils/utils.py +60 -0
- mindspore/version.py +1 -1
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/METADATA +13 -7
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/RECORD +260 -254
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/WHEEL +1 -1
- mindspore/include/c_api/ms/abstract.h +0 -67
- mindspore/include/c_api/ms/attribute.h +0 -197
- mindspore/include/c_api/ms/base/handle_types.h +0 -43
- mindspore/include/c_api/ms/base/macros.h +0 -32
- mindspore/include/c_api/ms/base/status.h +0 -33
- mindspore/include/c_api/ms/base/types.h +0 -283
- mindspore/include/c_api/ms/context.h +0 -102
- mindspore/include/c_api/ms/graph.h +0 -160
- mindspore/include/c_api/ms/node.h +0 -606
- mindspore/include/c_api/ms/tensor.h +0 -161
- mindspore/include/c_api/ms/value.h +0 -84
- mindspore/mindspore_shared_lib.dll +0 -0
- mindspore/nn/extend/basic.py +0 -140
- mindspore/nn/extend/embedding.py +0 -143
- mindspore/nn/extend/layer/normalization.py +0 -109
- mindspore/nn/extend/pooling.py +0 -117
- mindspore/nn/layer/embedding_service.py +0 -531
- mindspore/ops/_op_impl/aicpu/strided_slice_v2.py +0 -93
- mindspore/ops/_op_impl/aicpu/strided_slice_v2_grad.py +0 -66
- mindspore/ops/extend/__init__.py +0 -53
- mindspore/ops/extend/array_func.py +0 -218
- mindspore/ops/extend/math_func.py +0 -76
- mindspore/ops/extend/nn_func.py +0 -308
- mindspore/ops/silent_check.py +0 -162
- mindspore/profiler/parser/msadvisor_analyzer.py +0 -82
- mindspore/profiler/parser/msadvisor_parser.py +0 -240
- mindspore/train/callback/_mindio_ttp.py +0 -443
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.3.0.dist-info → mindspore-2.4.1.dist-info}/top_level.txt +0 -0
mindspore/parallel/_auto_parallel_context.py CHANGED
@@ -76,6 +76,7 @@ class _PipelineConfig:
 class _PipelineScheduler:
     PIPELINE_1F1B = "1f1b"
     PIPELINE_GPIPE = "gpipe"
+    PIPELINE_SEQPIPE = "seqpipe"


 class _AutoParallelContext:
@@ -168,6 +169,24 @@ class _AutoParallelContext:
         self.check_context_handle()
         return _ParallelFusionConfig.CONFIG

+    def set_dump_local_norm(self, dump_local_norm):
+        """
+        Set dump local norm for auto parallel.
+
+        Args:
+            dump_local_norm (bool): User need to specify if he want to dump local norm. Default: False
+
+        Raises:
+            KeyError: When key of comm_fusion is not 'allreduce'.
+        """
+        self.check_context_handle()
+        self._context_handle.set_dump_local_norm(dump_local_norm)
+
+    def get_dump_local_norm(self):
+        """Get dump local norm."""
+        self.check_context_handle()
+        return self._context_handle.get_dump_local_norm()
+
     def set_fusion_threshold_mb(self, fusion_threshold=64, comm_type="allreduce"):
         """
         Set fusion threshold (MB) for auto parallel.
@@ -584,7 +603,7 @@ class _AutoParallelContext:
         self.check_context_handle()
         dir_path = os.path.dirname(strategy_ckpt_save_file)
         if dir_path and not os.path.exists(dir_path):
-            os.makedirs(dir_path, exist_ok=True)
+            os.makedirs(dir_path, mode=0o700, exist_ok=True)
         self._context_handle.set_strategy_ckpt_save_file(strategy_ckpt_save_file)

     def get_strategy_ckpt_save_file(self):
@@ -643,7 +662,7 @@ class _AutoParallelContext:
         self.check_context_handle()
         dir_path = os.path.dirname(group_ckpt_save_file)
         if dir_path and not os.path.exists(dir_path):
-            os.makedirs(dir_path)
+            os.makedirs(dir_path, mode=0o700, exist_ok=True)
         self._context_handle.set_group_ckpt_save_file(group_ckpt_save_file)

     def get_parameter_broadcast_is_set(self):
@@ -896,7 +915,8 @@ class _AutoParallelContext:
                                   pipeline_config[pp_interleave])

         Validator.check_string(pipeline_config[pp_scheduler], [_PipelineScheduler.PIPELINE_1F1B,
-                                                               _PipelineScheduler.PIPELINE_GPIPE])
+                                                               _PipelineScheduler.PIPELINE_GPIPE,
+                                                               _PipelineScheduler.PIPELINE_SEQPIPE])
         if not pipeline_config[pp_interleave] and pipeline_config[pp_scheduler] != _PipelineScheduler.PIPELINE_1F1B:
             raise ValueError(f"When pipeline_interleave is False, {pp_scheduler} is not supported")

@@ -1117,9 +1137,9 @@ class _AutoParallelContext:
         """
         self.check_context_handle()
         if comm_type == "allgather" and not self.get_enable_all_gather_fusion():
-
+            self.set_enable_all_gather_fusion(True)
         if comm_type == "reducescatter" and not self.get_enable_reduce_scatter_fusion():
-
+            self.set_enable_reduce_scatter_fusion(True)
         if not isinstance(comm_fusion, dict):
             raise TypeError("For 'comm_fusion', {} config must be dict, but got the type : {}.".format(
                 comm_type, type(comm_fusion)))
@@ -1153,7 +1173,7 @@ class _AutoParallelContext:
         """
         self.check_context_handle()
         if not self.get_enable_all_reduce_fusion():
-
+            self.set_enable_all_reduce_fusion(True)
         if not isinstance(comm_fusion, dict):
             raise TypeError("For 'comm_fusion', the 'allreduce' config must be dict, but got the type : {}.".format(
                 type(comm_fusion)))
@@ -1210,7 +1230,7 @@ def _set_ops_strategy_json_config(type="SAVE", path="", mode="all"):
     """
     dir_path = os.path.dirname(path)
     if dir_path and not os.path.exists(dir_path):
-        os.makedirs(dir_path)
+        os.makedirs(dir_path, mode=0o700, exist_ok=True)
     check_type = ["SAVE", "LOAD"]
     check_mode = ["all", "principal"]
     if type in check_type and mode in check_mode:
@@ -1266,7 +1286,8 @@ _set_auto_parallel_context_func_map = {
     "sharding_propagation": auto_parallel_context().set_sharding_propagation,
     "enable_alltoall": auto_parallel_context().set_enable_alltoall,
     "strategy_ckpt_config": auto_parallel_context().set_strategy_ckpt_config,
-    "comm_fusion": auto_parallel_context().set_comm_fusion}
+    "comm_fusion": auto_parallel_context().set_comm_fusion,
+    "dump_local_norm": auto_parallel_context().set_dump_local_norm}

 _get_auto_parallel_context_func_map = {
     "device_num": auto_parallel_context().get_device_num,
@@ -1298,7 +1319,8 @@ _get_auto_parallel_context_func_map = {
     "enable_alltoall": auto_parallel_context().get_enable_alltoall,
     "comm_fusion": auto_parallel_context().get_comm_fusion,
     "strategy_ckpt_config": auto_parallel_context().get_strategy_ckpt_config,
-    "full_batch_is_set": auto_parallel_context().get_full_batch_is_set}
+    "full_batch_is_set": auto_parallel_context().get_full_batch_is_set,
+    "dump_local_norm": auto_parallel_context().get_dump_local_norm}


 @args_type_check(device_num=int, global_rank=int, gradients_mean=bool, gradient_fp32_sync=bool,
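The hunks above add set_dump_local_norm/get_dump_local_norm to _AutoParallelContext and register a "dump_local_norm" key in the setter and getter maps. A minimal sketch of how that key would be reached from user code, assuming it is exposed through mindspore.set_auto_parallel_context / get_auto_parallel_context the same way as the other keys in those maps (the keyword argument below is an assumption, not taken from the diff):

    import mindspore as ms

    # Assumed usage: "dump_local_norm" dispatches to
    # auto_parallel_context().set_dump_local_norm via the map added above.
    ms.set_auto_parallel_context(parallel_mode="semi_auto_parallel", dump_local_norm=True)
    print(ms.get_auto_parallel_context("dump_local_norm"))  # expected: True

The same file also hardens checkpoint-directory creation: the os.makedirs calls for strategy and group checkpoint paths now pass mode=0o700 and exist_ok=True, so the directories are created owner-only and a repeated call does not fail when the directory already exists.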
mindspore/parallel/_cell_wrapper.py CHANGED
@@ -16,11 +16,16 @@
 from __future__ import absolute_import
 from __future__ import division

+import numpy as np
+
+from mindspore import context
 from mindspore.nn.cell import Cell
 from mindspore.ops import operations as P
 from mindspore.ops.operations.comm_ops import AllGather
 from mindspore.communication import GlobalComm
 from mindspore.common import jit
+from mindspore.communication import create_group
+from mindspore.train._utils import get_parameter_redundancy, remove_param_redundancy

 _ALLGATHER_CELL = None

@@ -30,6 +35,7 @@ class AllGatherCell(Cell):
     Allgather cell, used in model parallel scenario.
     To allgather the selected parameter slice from each device.
     """
+
     def __init__(self, group, do_reshape, after_reshape_slice_shape):
         super(AllGatherCell, self).__init__(auto_prefix=False)
         self.allgather = AllGather(group)
@@ -54,6 +60,7 @@ class SaveOptShardCkptCell(Cell):
     Note:
         This could be optimized later with less communication consumption.
     """
+
     def __init__(self, group, do_reshape, after_reshape_slice_shape):
         super(SaveOptShardCkptCell, self).__init__(auto_prefix=False)
         self.allgather1 = AllGather(group)
@@ -71,6 +78,21 @@ class SaveOptShardCkptCell(Cell):
         return x


+class SingleCommunicator(Cell):
+    """
+    Used to broadcast single parameter.
+    """
+
+    def __init__(self, group_name):
+        super(SingleCommunicator, self).__init__()
+        self.allreduce = P.AllReduce(group=group_name)
+        self.add_flags(skip_auto_parallel_compile=True)
+
+    def construct(self, loaded_param):
+        result = self.allreduce(loaded_param)
+        return result
+
+
 def get_allgather_cell(group, need_merge_twice=False, do_reshape=False, after_reshape_slice_shape=()):
     """Get AllGatherCell object."""
     global _ALLGATHER_CELL
@@ -89,3 +111,66 @@ def destroy_allgather_cell():
     global _ALLGATHER_CELL
     if _ALLGATHER_CELL:
         _ALLGATHER_CELL = None
+
+
+def _chang_parallel_context(origin_dataset_strategy):
+    """Change the original parallel state."""
+    if context.get_context("mode") == context.GRAPH_MODE:
+        context.set_auto_parallel_context(parallel_mode="hybrid_parallel")
+    if origin_dataset_strategy != "data_parallel":
+        context.set_auto_parallel_context(dataset_strategy="data_parallel")
+
+
+def _restore_parallel_context(origin_parallel_mode, origin_dataset_strategy):
+    """Restore the original parallel state."""
+    if context.get_context("mode") == context.GRAPH_MODE:
+        context.set_auto_parallel_context(parallel_mode=origin_parallel_mode)
+    if origin_dataset_strategy != "data_parallel":
+        if origin_dataset_strategy is not None and isinstance(origin_dataset_strategy, list):
+            origin_dataset_strategy = tuple(tuple(ds_item) for ds_item in origin_dataset_strategy)
+        context.set_auto_parallel_context(dataset_strategy=origin_dataset_strategy)
+
+
+def _single_parameter_broadcast(net, layout, cur_rank=0, initial_rank=0):
+    """
+    Broadcast single parameter to other rank in data parallel dimension.
+    """
+    from mindspore import Tensor
+    origin_parallel_mode = context.get_auto_parallel_context("parallel_mode")
+    origin_dataset_strategy = context.get_auto_parallel_context("dataset_strategy")
+    if layout:
+        param_redundancy = get_parameter_redundancy(layout, initial_rank)
+    else:
+        param_redundancy = get_parameter_redundancy(net)
+    if not param_redundancy:
+        return
+    single_params = remove_param_redundancy(param_redundancy)
+    if not single_params:
+        return
+    param_redundancy_reversed = {}
+    for key, redundancy in param_redundancy.items():
+        for item in redundancy:
+            if len(item) == 1:
+                continue
+            if cur_rank in item:
+                param_redundancy_reversed.setdefault(item, []).append(key)
+    if not param_redundancy_reversed or cur_rank not in single_params:
+        return
+    net_param_dict = net.parameters_dict()
+    _chang_parallel_context(origin_dataset_strategy)
+    for group, params in param_redundancy_reversed.items():
+        create_group(str(group), list(group))
+        allreduce_input = []
+        for param in params:
+            if param not in net_param_dict:
+                continue
+            real_param = net_param_dict[param]
+            if param not in single_params[cur_rank]:
+                real_param.set_data(Tensor(np.zeros(real_param.shape), dtype=real_param.dtype), real_param.sliced)
+            allreduce_input.append(real_param)
+        if not allreduce_input:
+            continue
+        communicator = SingleCommunicator(str(group))
+        for real_param in allreduce_input:
+            real_param.set_data(communicator(real_param), real_param.sliced)
+    _restore_parallel_context(origin_parallel_mode, origin_dataset_strategy)
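The additions above give _cell_wrapper.py a broadcast path that removes redundant parameter copies: within each group of ranks that share a parameter, ranks that do not hold the authoritative copy overwrite theirs with zeros, then SingleCommunicator (an AllReduce over that group) sums the copies, which leaves the owner's values on every rank. A standalone numpy sketch of that zero-then-sum idea, with made-up ranks and values (real execution goes through P.AllReduce on a group created with create_group):

    import numpy as np

    def allreduce_sum(local_copies):
        """Simulate AllReduce(sum) across the ranks of one communication group."""
        total = np.sum(np.stack(local_copies), axis=0)
        return [total.copy() for _ in local_copies]

    owner_rank = 0
    group = [0, 1, 2, 3]                       # hypothetical ranks sharing one parameter
    param_on_owner = np.array([1.0, 2.0, 3.0])

    # Non-owners hold zeros, so the sum reproduces the owner's slice on every rank.
    local_copies = [param_on_owner if rank == owner_rank else np.zeros_like(param_on_owner)
                    for rank in group]
    for rank, result in zip(group, allreduce_sum(local_copies)):
        print(rank, result)                    # each rank ends up with [1. 2. 3.]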
mindspore/parallel/_parallel_serialization.py CHANGED
@@ -24,7 +24,6 @@ from mindspore.parallel._tensor import _get_tensor_strategy, _construct_from_to_
     _generate_transform_operator_stack, _apply_tensor_transform_operators, _construct_tensor_layout_for_opt_shard, \
     _extract_layout_item

-
 MAX_PATH_LENGTH = 1024


@@ -37,14 +36,17 @@ def _convert_to_list(strategy, rank_id=None):
             dev_mat = list(layout.dev_matrix[0].dim)
             tensor_map = list(layout.tensor_map[0].dim)
             param_split_shape = list(layout.param_split_shape[0].dim)
+            field_size = int(layout.field)
+            shard_stride = int(layout.opt_weight_shard_step)
+            shard_size = int(layout.opt_weight_shard_size)
             pipeline_stage = 0
             origin_param_name = param_name
             if "-" in param_name:
                 pipeline_stage, origin_param_name = param_name.split("-")
                 pipeline_stage = int(pipeline_stage)
             if origin_param_name not in train_map:
-                train_map[origin_param_name] = [dev_mat, tensor_map, param_split_shape,
-
+                train_map[origin_param_name] = [dev_mat, tensor_map, param_split_shape, field_size,
+                                                shard_stride, shard_size,
                                                 [pipeline_stage]]
             else:
                 update_pipeline_stage_list = train_map.get(origin_param_name)[6] + [pipeline_stage]
@@ -54,15 +56,15 @@ def _convert_to_list(strategy, rank_id=None):
                     not_device0_nor_pipeline0 = ((rank_id // stage_device_num) > 0) and (pipeline_stage > 0)
                     if is_device0_and_pipeline0 or not_device0_nor_pipeline0:
                         train_map[origin_param_name] = [dev_mat, tensor_map, param_split_shape,
-
-
+                                                        field_size, shard_stride,
+                                                        shard_size, update_pipeline_stage_list]
                     else:
                         train_map.get(origin_param_name)[6] = update_pipeline_stage_list
                 else:
                     if np.all(pipeline_stage <= np.array(update_pipeline_stage_list)):
                         train_map[origin_param_name] = [dev_mat, tensor_map, param_split_shape,
-
-
+                                                        field_size, shard_stride,
+                                                        shard_size, update_pipeline_stage_list]
                     else:
                         train_map.get(origin_param_name)[6] = update_pipeline_stage_list
         except BaseException as e:
@@ -174,6 +176,8 @@ def _build_json_strategy(strategy_filename):

 def _build_searched_strategy(strategy_filename):
     """build searched strategy"""
+    if strategy_filename is None:
+        return strategy_filename
     _check_strategy_file(strategy_filename)
     if strategy_filename[-5:] != ".json":
         return _build_protobuf_strategy(strategy_filename)
@@ -239,7 +243,10 @@ def _extract_layout_map(strategy_file, rank_id=None):
     """Extract layout map"""
     layout_map = None
     if strategy_file is not None:
-
+        if not isinstance(strategy_file, dict):
+            src_strategy = _build_searched_strategy(strategy_file)
+        else:
+            src_strategy = strategy_file
         layout_map = _convert_to_list(src_strategy, rank_id)
     return layout_map

@@ -248,7 +255,10 @@ def _extract_pipeline_stage_num(strategy_file):
     """extract pipeline stage num"""
     pipeline_stage_num = 1
     if strategy_file is not None:
-
+        if not isinstance(strategy_file, dict):
+            src_strategy = _build_searched_strategy(strategy_file)
+        else:
+            src_strategy = strategy_file
         layout_map = _convert_to_list(src_strategy)
         pipeline_stage_set = set()
         for _, layout in layout_map.items():
@@ -323,7 +333,10 @@ def _get_device_num_from_strategy(strategy_file=None):
     """Get device num from strategy file"""
     if strategy_file is None:
         return 1
-
+    if not isinstance(strategy_file, dict):
+        src_strategy = _build_searched_strategy(strategy_file)
+    else:
+        src_strategy = strategy_file
     strategy_list = _convert_to_list(src_strategy)
     device_mat = list(strategy_list.values())[0][0]
     return np.prod(device_mat)
@@ -341,14 +354,15 @@ def _rank_list_for_transform_parallel_checkpoint(rank_id, src_strategy_list, dst
         from_dev_matrix, from_tensor_map, from_opt_shard_step, from_opt_shard_size = _extract_layout_item(
             src_strategy_list.get(param_name))
         from_device_num = np.prod(from_dev_matrix)
-        fake_tensor_shape = [8] * len(from_tensor_map)
         to_dev_matrix = [1]
-        to_tensor_map = [-1] * len(
+        to_tensor_map = [-1] * len(from_tensor_map)
         to_opt_shard_step = 0
         to_opt_shard_size = 0
         if dst_strategy_list is not None:
             to_dev_matrix, to_tensor_map, to_opt_shard_step, to_opt_shard_size = _extract_layout_item(
                 dst_strategy_list.get(param_name))
+        to_device_num = np.prod(to_dev_matrix)
+        fake_tensor_shape = [max(from_device_num, to_device_num)] * len(from_tensor_map)
         handled_key = (from_dev_matrix, from_tensor_map, from_opt_shard_step, from_opt_shard_size,
                        to_dev_matrix, to_tensor_map, to_opt_shard_step, to_opt_shard_size)
         if handled_key in handled_layout:
@@ -433,7 +447,6 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
         param_rank_map = _get_needed_rank_transform_operator_map_by_layouts(from_tensor_layout, to_tensor_layout,
                                                                             device_list, rank_id)

-
         from_info_tuple = (from_opt_shard_size, from_dev_matrix, from_tensor_map, from_full_tensor_shape)
         to_info_tuple = (to_opt_shard_size, to_dev_matrix_origin, to_tensor_map_origin, origin_tensor_shape)
         _insert_opt_shard_reshape(param_rank_map, from_info_tuple, to_info_tuple)
@@ -443,10 +456,10 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
         transform_tensor = ms.Tensor(param_total_dict_copy[rank_id % device_num])
         requires_grad = param_attr_dict[param_name][rank_id % device_num][0]
         layerwise_parallel = param_attr_dict[param_name][rank_id % device_num][1]
-
+        transform_param = ms.Parameter(transform_tensor, param_name, requires_grad, layerwise_parallel)
         if param_type_dict[param_name][rank_id % device_num] == "BFloat16":
-
-            transform_param_dict[param_name] =
+            transform_param.set_dtype(ms.bfloat16)
+        transform_param_dict[param_name] = transform_param
     if device_num < 1:
         raise ValueError("None of the parameters in checkpoint file are in either src strategy or "
                          "dst strategy. Please check correctness of strategy files.")
@@ -454,13 +467,13 @@ def _transform_parallel_checkpoint(rank_id, param_total_dict, param_attr_dict, s
     # Handle those parameter like learning_rate, global_step which not in strategy_file.
     for param_name, _ in param_total_dict.items():
         if param_name not in transform_param_dict:
-
+            transform_param = ms.Parameter(
                 ms.Tensor(param_total_dict[param_name][rank_id % device_num]), param_name,
                 param_attr_dict[param_name][rank_id % device_num][0],
                 param_attr_dict[param_name][rank_id % device_num][1])
            if param_type_dict[param_name][rank_id % device_num] == "BFloat16":
-
-                transform_param_dict[param_name] =
+                transform_param.set_dtype(ms.bfloat16)
+            transform_param_dict[param_name] = transform_param

     transform_param_list = [{"name": param_name, "data": param_data}
                             for param_name, param_data in transform_param_dict.items()]
@@ -531,3 +544,18 @@ def _insert_opt_shard_reshape(param_rank_map, from_info_tuple, to_info_tuple):
                 continue
             to_slice_tensor_shape += (item // to_tensor_strategy[i],)
         param_rank_map.get(param_rank).append(('Reshape', list(to_slice_tensor_shape)))
+
+
+def _get_param_list_when_first_dim_sharded(device_arrangement, first_dim_sharded_device_index, rank):
+    """Calculate rank list for optimizer parallel when first dim of parameter is sharded by other parallel method"""
+    total_device_num = 1
+    for n in device_arrangement:
+        total_device_num *= n
+    if first_dim_sharded_device_index != len(device_arrangement) - 1:
+        return list(range(0, total_device_num))
+    first_dim_sharded_size = device_arrangement[-1 - first_dim_sharded_device_index]
+    range_size = total_device_num // first_dim_sharded_size
+    offset = rank % range_size
+    start = rank - offset
+    param_total_list = list(range(start, start + range_size))
+    return param_total_list
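After this change _convert_to_list records seven positional entries per parameter, and code elsewhere in the diff relies on those positions (layout_item[4] and layout_item[5] in _extract_layout_item, index 6 for the pipeline-stage list). A small sketch of the record layout as it reads from the added code, with made-up values for illustration:

    # Per-parameter layout record built by _convert_to_list after this change.
    layout_record = [
        [4, 2],   # 0: dev_mat
        [1, -1],  # 1: tensor_map
        [],       # 2: param_split_shape
        0,        # 3: field_size
        0,        # 4: shard_stride (opt_weight_shard_step)
        -1,       # 5: shard_size (opt_weight_shard_size)
        [0],      # 6: pipeline stages the parameter appears in
    ]
    dev_mat, tensor_map = layout_record[0], layout_record[1]
    opt_shard_step, opt_shard_size = layout_record[4], layout_record[5]
    print(dev_mat, tensor_map, opt_shard_step, opt_shard_size)

The None check added to _build_searched_strategy and the isinstance(strategy_file, dict) branches added to _extract_layout_map, _extract_pipeline_stage_num and _get_device_num_from_strategy mean these helpers now accept either a strategy file path or an already-parsed strategy dict.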
mindspore/parallel/_tensor.py CHANGED
@@ -334,8 +334,10 @@ def _extract_layout_item(layout_item):
     tensor_map = layout_item[1]
     opt_shard_step = layout_item[4]
     opt_shard_size = layout_item[5]
+    tensor_strategy = _get_tensor_strategy(dev_matrix, tensor_map)
+    model_parallel_shard_size = np.prod(tensor_strategy)
     if opt_shard_size == -1:
-        opt_shard_size = np.prod(dev_matrix) //
+        opt_shard_size = np.prod(dev_matrix) // model_parallel_shard_size
     return dev_matrix, tensor_map, opt_shard_step, opt_shard_size


@@ -406,12 +408,35 @@ def _construct_tensor_layout_for_opt_shard(dev_matrix, tensor_map, opt_shard_ste
     if opt_shard_step == 0 or opt_shard_size == 0:
         return dev_matrix, tensor_map, list(origin_full_tensor_shape)
     tensor_strategy = _get_tensor_strategy(dev_matrix, tensor_map)
-
-
+    repeated_dim = []
+    dev_sharded_index = []
+    for dim in tensor_map:
+        if dim != -1:
+            dev_sharded_index.append(len(dev_matrix) - dim - 1)
+    for index, value in enumerate(dev_matrix):
+        if index not in dev_sharded_index and value > 1:
+            repeated_dim.append(index)
+    if not repeated_dim:
+        raise ValueError("The device_matrix {} and tensor_map {} cannot sharding opt_shard".
+                         format(dev_matrix, tensor_map))
+    if len(repeated_dim) == 1 and np.prod(dev_matrix[repeated_dim[0] + 1:]) != opt_shard_step:
         raise ValueError("The optimizer sharding step {} is not equal to the model parallel sharding size {}.".
-                         format(opt_shard_step,
-
+                         format(opt_shard_step, np.prod(dev_matrix[repeated_dim[0] + 1:])))
     first_dim_no_sharding_size = origin_full_tensor_shape[0] // tensor_strategy[0]
+    if (len(repeated_dim) < len(dev_matrix) and len(repeated_dim) > 1) or repeated_dim[0] > 0:
+        tensor_shape_new = list(origin_full_tensor_shape)
+        tensor_shape_new[0] = tensor_strategy[0]
+        accu_shp = 1
+        for i in range(len(repeated_dim) - 1):
+            opt_sharding_size = dev_matrix[repeated_dim[i]]
+            tensor_shape_new.insert(i + 1, opt_sharding_size)
+            accu_shp = accu_shp * opt_sharding_size
+        tensor_shape_new.insert(len(repeated_dim), first_dim_no_sharding_size // accu_shp)
+        tensor_map_new = list(copy.deepcopy(tensor_map))
+        for index, r_dim in enumerate(repeated_dim):
+            tensor_map_new.insert(index + 1, len(dev_matrix) - r_dim - 1)
+        return list(dev_matrix), tensor_map_new, tensor_shape_new
+
     full_tensor_shape = list(origin_full_tensor_shape)
     full_tensor_shape[0] = tensor_strategy[0]
     full_tensor_shape.insert(1, first_dim_no_sharding_size)
@@ -452,7 +477,7 @@ def _get_needed_rank_transform_operator_map_by_layouts(from_tensor_layout, to_te
     result_map = {self_rank: transform_operators}
     for operators in transform_operators:
         op_name = operators[0]
-        if op_name == "
+        if op_name == "AllConcat":
             groups = operators[1][:-1]
             stack.append((index, groups))
         index += 1
@@ -466,7 +491,7 @@ def _get_needed_rank_transform_operator_map_by_layouts(from_tensor_layout, to_te
         index = 0
         for operators in new_transform_operators:
             op_name = operators[0]
-            if op_name == "
+            if op_name == "AllConcat" and index < group_info[0]:
                 groups = operators[1][:-1]
                 stack.insert(0, (index, groups))
             index += 1
@@ -491,7 +516,7 @@ def _generate_transform_operator_stack(transform_operators_map, self_rank):
         level = queue_front[1]
         current_operator = queue_front[2]
         if level >= 1:
-            if current_operator[0] == "
+            if current_operator[0] == "AllConcat":
                 current_group = current_operator[1][:-1]
                 for rank_id in current_group:
                     handle_queue.append((rank_id, level - 1, transform_operators_map[rank_id][level - 1]))
@@ -523,7 +548,7 @@ def _apply_tensor_transform_operators(transform_operator_stack, tensor_dict, dev
             if operator[0] != op_name:
                 raise ValueError("The operator in the same level should be equal in the transform tensor operator "
                                  "list, but the find {} and {} in level {}".format(op_name, operator[0], cur_level))
-            if operator[0] != "
+            if operator[0] != "AllConcat":
                 tensor_dict[rank_id % device_num] = _apply_operator(operator[0])(tensor_dict[rank_id % device_num],
                                                                                  operator)
                 continue
@@ -532,7 +557,7 @@ def _apply_tensor_transform_operators(transform_operator_stack, tensor_dict, dev
                 raise ValueError("The checkpoint file of rank {} is missing.".format(rank % device_num))
             allgather_list = [tensor_dict[rank % device_num] for rank in operator[1][:-1]]
             tmp_tensor_dict[rank_id % device_num] = _apply_operator(operator[0])(allgather_list, operator)
-        if op_name == "
+        if op_name == "AllConcat":
             for rank, value in tmp_tensor_dict.items():
                 tensor_dict[rank % device_num] = value
         level_operators.clear()
@@ -565,6 +590,8 @@ def _apply_operator(operator_name):
         Returns:
             The data of tensor after apply operator.
         """
+        if str(type(numpy_data)) == "<class 'builtins.PySafeSlice'>":
+            numpy_data = numpy_data[:]
         if not isinstance(numpy_data, np.ndarray):
             raise TypeError("The data should be a numpy.ndarray.")
         _check_operator(reshape_op)
@@ -604,8 +631,6 @@ def _apply_operator(operator_name):
         Returns:
             The data of tensor after apply operator.
         """
-        if not isinstance(numpy_data, np.ndarray):
-            raise TypeError("The data should be a numpy.ndarray.")
         _check_operator(slice_op)
         if len(slice_op[1]) % 3 != 0:
             raise ValueError("The slice operator information is wrong.")
@@ -621,7 +646,7 @@ def _apply_operator(operator_name):
         return numpy_data[slice_index]

     _apply_operator_map = {"Reshape": _apply_reshape_operator, "StridedSlice": _apply_slice_operator,
-                           "
+                           "AllConcat": _apply_allconcat_operator}
     return _apply_operator_map.get(operator_name)


@@ -658,3 +683,92 @@ def _reshape_param_data_with_weight(param_data, dev_mat, field_size):
     for i in range(1, len(tensor_slices_col)):
         new_tensor = np.concatenate((new_tensor, np.array(tensor_slices_col[i]).reshape(-1, 1)), axis=1)
     return Tensor(new_tensor)
+
+
+def _load_tensor_shape(dev_mat, tensor_map, full_shape=None, rank_id=-1):
+    """get tensor shape by slice"""
+    if rank_id == -1:
+        rank = get_rank()
+    else:
+        rank = rank_id
+    tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map)
+    tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank)
+    np_tensor_list = _chunk_shape_by_strategy(full_shape, tensor_strategy)
+    np_tensor_slice_index = np_tensor_list[int(tensor_slice_index)]
+    res = []
+    for index in np_tensor_slice_index:
+        res.append(slice(index[0], index[1]))
+    return tuple(res)
+
+
+def _count_tensor_shape(dev_mat, tensor_map, full_shape=None, rank_id=-1):
+    """get tensor shape"""
+    if rank_id == -1:
+        rank = get_rank()
+    else:
+        rank = rank_id
+    tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map)
+    tensor_slice_index = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank)
+    np_tensor_list = _chunk_shape_by_strategy(full_shape, tensor_strategy)
+    np_tensor_slice_index = np_tensor_list[int(tensor_slice_index)]
+    res = []
+    for index in np_tensor_slice_index:
+        res.append(index[1] - index[0])
+    return res
+
+
+def _load_tensor_shape_by_layout(tensor, layout, rank_id):
+    """get tensor shape by layout"""
+    if not isinstance(layout, tuple):
+        raise TypeError("The layout should be tuple! layout is {}".format(layout))
+    if len(layout) < 7:
+        raise ValueError("The length of layout must be larger than 6! layout is {}".format(layout))
+    slice_shape = layout[2]
+    if slice_shape:
+        return slice_shape
+    tensor_map = layout[1]
+    if not tensor_map:
+        return tensor.shape
+    dev_mat = layout[0]
+    uniform_split = layout[4]
+    group = layout[5]
+    full_shape = layout[6]
+    if not full_shape:
+        full_shape = tensor.shape
+    if uniform_split == 0:
+        raise RuntimeError("The load tensor only support uniform split now")
+    tensor_slice_shape = _count_tensor_shape(dev_mat, tensor_map, full_shape, rank_id)
+    if group:
+        # get a totally shard tensor slice for parallel optimizer
+        size = get_group_size(group)
+        tensor_slice_shape[0] //= size
+    return tensor_slice_shape
+
+
+def _chunk_shape_by_strategy(full_shape, strategy):
+    """chunk shape by strategy"""
+    shape = []
+    for i in full_shape:
+        shape.append([0, i])
+    return _chunk_shape(shape, strategy, len(strategy))
+
+
+def _chunk_shape(np_tensor, strategy, depth):
+    """_chunk shape"""
+    output = []
+    axis = len(np_tensor) - depth
+    left, right = np_tensor[axis]
+    num = strategy[0]
+    chunk_size = (right - left) / num
+    append = [[i, int(i + chunk_size)] for i in range(left, right) if i % chunk_size == 0]
+    np_tensor_new = []
+    for i in append:
+        np_tensor_tmp = copy.deepcopy(np_tensor)
+        np_tensor_tmp[axis] = i
+        np_tensor_new.append(np_tensor_tmp)
+    if depth == 1:
+        return np_tensor_new
+    for ret_ in np_tensor_new:
+        output.extend(
+            _chunk_shape(ret_, strategy[len(strategy) - depth + 1:len(strategy)], depth - 1))
+    return output