mindspore 2.4.1__cp310-cp310-win_amd64.whl → 2.4.10__cp310-cp310-win_amd64.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of mindspore might be problematic.

Files changed (47)
  1. mindspore/.commit_id +1 -1
  2. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  3. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  4. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  5. mindspore/common/api.py +1 -4
  6. mindspore/common/file_system.py +2 -0
  7. mindspore/common/parameter.py +1 -14
  8. mindspore/communication/_comm_helper.py +5 -0
  9. mindspore/context.py +7 -2
  10. mindspore/dataset/engine/datasets_standard_format.py +17 -0
  11. mindspore/dataset/engine/datasets_user_defined.py +27 -1
  12. mindspore/experimental/llm_boost/__init__.py +2 -2
  13. mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
  14. mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
  15. mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
  16. mindspore/include/api/context.h +1 -1
  17. mindspore/include/dataset/constants.h +2 -2
  18. mindspore/mindspore_backend.dll +0 -0
  19. mindspore/mindspore_common.dll +0 -0
  20. mindspore/mindspore_core.dll +0 -0
  21. mindspore/mindspore_np_dtype.dll +0 -0
  22. mindspore/mindspore_ops.dll +0 -0
  23. mindspore/nn/__init__.py +2 -0
  24. mindspore/nn/cell.py +16 -2
  25. mindspore/nn/layer/conv.py +3 -0
  26. mindspore/nn/layer/pooling.py +8 -10
  27. mindspore/nn/utils/__init__.py +22 -0
  28. mindspore/nn/utils/init.py +71 -0
  29. mindspore/ops/_grad_experimental/grad_comm_ops.py +25 -7
  30. mindspore/ops/auto_generate/gen_ops_prim.py +3 -2
  31. mindspore/ops/function/math_func.py +5 -4
  32. mindspore/ops/operations/comm_ops.py +4 -1
  33. mindspore/ops/operations/custom_ops.py +6 -4
  34. mindspore/ops/operations/nn_ops.py +7 -2
  35. mindspore/parallel/_auto_parallel_context.py +23 -4
  36. mindspore/parallel/_cell_wrapper.py +22 -3
  37. mindspore/parallel/_utils.py +0 -1
  38. mindspore/run_check/_check_version.py +17 -8
  39. mindspore/train/callback/_tft_register.py +7 -6
  40. mindspore/train/model.py +1 -0
  41. mindspore/train/serialization.py +4 -1
  42. mindspore/version.py +1 -1
  43. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/METADATA +2 -2
  44. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/RECORD +47 -45
  45. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
  46. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
  47. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
mindspore/experimental/llm_boost/atb/llama_boost.py CHANGED
@@ -15,10 +15,16 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import (
+    AtbBoostBase,
+    PositionEmbeddingType,
+    NormType,
+)
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
+CPP_LLAMA_MODEL_CLASS_NAME = "llama_LlamaDecoderModel"
+
 
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Llama")
 class LlamaBoost(AtbBoostBase):
@@ -30,14 +36,17 @@ class LlamaBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "llama_parallel_DecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "llama_parallel_DecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
         coder_param = {
-            "rmsNormEps": self.config.rms_norm_eps,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
@@ -46,32 +55,41 @@ class LlamaBoost(AtbBoostBase):
             "isFA": False,
             "isBF16": self.dtype == mstype.bfloat16,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)],
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "isEmbeddingParallel": False,
             "isLmHeadParallel": not self.config.parallel_config.vocab_emb_dp,
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": True,
-            "kvQuant": self.kv_quant is not None,
+            "enableSwiGLU": True,
+            "enablekvQuant": self.kv_quant is not None,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": "lccl",
+            "backend": self.config.communication_backend,
             "rankTableFile": "",
-            "positionEmbeddingType": self.position_embedding_type,
+            "positionEmbeddingType": PositionEmbeddingType.ROPE,
             "hiddenSize": self.config.hidden_size,
             "gemma": False,
-            "enableAddNorm": True,
-            "supportCompressHead": False,
+            "enableAddNorm": False,
+            "enableCompressHead": False,
+            "isUnpadInputs": True,
         }
         encoder_param = {
-            **coder_param, "isPrefill": True,
-            "supportLcoc": True,
-            "supportSpeculate": False,
-            "skipWordEmbedding": False
+            **coder_param,
+            "isPrefill": True,
+            "enableLcoc": True,
+            "enableSpeculate": False,
+            "skipWordEmbedding": False,
+            "enableSplitFuse": False,
         }
         decoder_param = {
-            **coder_param, "isPrefill": False, "supportLcoc": False,
-            "supportSpeculate": False
+            **coder_param,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
@@ -92,14 +110,15 @@
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen,
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
         self.acl_decoder_operation_inputs[1] = self.placeholder
-        self.acl_decoder_operation_inputs[2] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_decoder_operation_inputs[2] = position_ids
         self.acl_decoder_operation_inputs[3] = cos_embed
         self.acl_decoder_operation_inputs[4] = sin_embed
         self.acl_decoder_operation_inputs[5] = attention_mask
@@ -108,8 +127,6 @@
         self.acl_decoder_operation_inputs[8] = self.placeholder
         self.acl_decoder_operation_inputs[9] = self.placeholder
         self.acl_decoder_operation_inputs[10] = self.placeholder
-        self.acl_decoder_operation_inputs[11] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[12] = self.cast(
-            lm_head_indices, mstype.int64)
+        self.acl_decoder_operation_inputs[11] = input_lengths
+        self.acl_decoder_operation_inputs[12] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
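One detail worth noting in the hunks above: `coder_param` is passed through `json.dumps`, so the new enum-valued fields (`NormType.RMS_NORM`, `PositionEmbeddingType.ROPE`) must reduce to plain JSON values. A minimal sketch of how that works, assuming the enums exported by `boost_base` are integer-valued; the member names and values below are illustrative assumptions, not taken from the diff:

```python
# Illustrative only: assumed IntEnum definitions standing in for the real
# ones in mindspore/experimental/llm_boost/atb/boost_base.py.
from enum import IntEnum
import json

class NormType(IntEnum):
    LAYER_NORM = 0  # assumed value
    RMS_NORM = 1    # assumed value

# IntEnum members are ints, so json.dumps emits them as plain numbers:
param = {"normEps": 1e-5, "normType": NormType.RMS_NORM}
print(json.dumps(param))  # {"normEps": 1e-05, "normType": 1}
```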
mindspore/experimental/llm_boost/atb/qwen_boost.py CHANGED
@@ -15,11 +15,14 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase, NormType
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
 
+CPP_QWEN_MODEL_CLASS_NAME = "qwen_QwenDecoderModel"
+
+
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Qwen")
 class QwenBoost(AtbBoostBase):
     """QwenBoost class"""
@@ -30,9 +33,11 @@ class QwenBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "qwen_DecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "qwen_DecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
@@ -42,24 +47,43 @@ class QwenBoost(AtbBoostBase):
             "withEmbedding": True,
             "isEmbeddingParallel": True,
             "isLmHeadParallel": True,
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": not self.need_nz,
-            "rmsNormEps": self.config.rms_norm_eps,
+            "enableSwiGLU": not self.need_nz,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
             "numKeyValueHeadsPerRank": self.n_kv_heads // self.device_num,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": "lccl",
+            "backend": self.config.communication_backend,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)],
-            "kvQuant": self.kv_quant is not None,
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearHasBias": [[True, False, False, False]] * self.num_layers,
+            "enableKvQuant": self.kv_quant is not None,
+            "enableLora": False,
+            "isUnpadInputs": True,
+            "enableAddNorm": False,
+        }
+        encoder_param = {
+            **param_dict,
+            "isPrefill": True,
+            "enableLcoc": False,
+            "enableSplitFuse": False,
+        }
+        decoder_param = {
+            **param_dict,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
+            "enablePrefixCache": False,
         }
-        encoder_param = {**param_dict, "isPrefill": True, "supportLcoc": False}
-        decoder_param = {**param_dict, "isPrefill": False,
-                         "supportLcoc": False, "supportSpeculate": False}
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
 
@@ -79,13 +103,14 @@
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen,
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
-        self.acl_decoder_operation_inputs[1] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
+        self.acl_decoder_operation_inputs[1] = position_ids
         self.acl_decoder_operation_inputs[2] = cos_embed
         self.acl_decoder_operation_inputs[3] = sin_embed
         self.acl_decoder_operation_inputs[4] = attention_mask
@@ -93,9 +118,7 @@
         self.acl_decoder_operation_inputs[6] = slots
         self.acl_decoder_operation_inputs[7] = self.placeholder
         self.acl_decoder_operation_inputs[8] = self.placeholder
-        self.acl_decoder_operation_inputs[9] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[10] = self.cast(
-            lm_head_indices, mstype.int64)
-        self.acl_decoder_operation_inputs[11] = self.placeholder
+        self.acl_decoder_operation_inputs[9] = self.placeholder
+        self.acl_decoder_operation_inputs[10] = input_lengths
+        self.acl_decoder_operation_inputs[11] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
mindspore/include/api/context.h CHANGED
@@ -236,7 +236,7 @@ std::string DeviceInfoContext::GetProviderDevice() const { return CharToString(G
 void DeviceInfoContext::SetProviderDevice(const std::string &device) { SetProviderDevice(StringToChar(device)); }
 
 /// \brief Derived from DeviceInfoContext, The configuration of the model running auto on the Host Devices, include
-/// CPU/GPU/NPU/Ascend310/Ascend910. This option is only valid for MindSpore Lite.
+/// CPU/GPU/NPU/Ascend. This option is only valid for MindSpore Lite.
 class MS_API AutoDeviceInfo : public DeviceInfoContext {
  public:
   /// \brief Get the type of this DeviceInfoContext.
mindspore/include/dataset/constants.h CHANGED
@@ -108,8 +108,8 @@ enum class DATASET_API ManualOffloadMode {
 enum class DATASET_API MapTargetDevice {
   kCpu = 0,     ///< CPU Device.
   kGpu,         ///< Gpu Device.
-  kAscend310,   ///< Ascend310 Device.
-  kAscend910B,  ///< Ascend910B Device.
+  kAscend310,   ///<
+  kAscend910B,  ///<
   kInvalid = 100
 };
 
mindspore/mindspore_backend.dll CHANGED (binary)
mindspore/mindspore_common.dll CHANGED (binary)
mindspore/mindspore_core.dll CHANGED (binary)
mindspore/mindspore_np_dtype.dll CHANGED (binary)
mindspore/mindspore_ops.dll CHANGED (binary)
mindspore/nn/__init__.py CHANGED
@@ -31,6 +31,7 @@ from mindspore.nn.wrap import *
 from mindspore.nn.grad import Jvp, Vjp
 from mindspore.nn.sparse import *
 from mindspore.nn.reinforcement import *
+from mindspore.nn.utils import *
 
 __all__ = ["Cell", "GraphCell"]
 __all__.extend(layer.__all__)
@@ -43,5 +44,6 @@ __all__.extend(sparse.__all__)
 __all__.extend(learning_rate_schedule.__all__)
 __all__.extend(dynamic_lr.__all__)
 __all__.extend(reinforcement.__all__)
+__all__.extend(utils.__all__)
 
 __all__.sort()
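The star import plus the `__all__` extension re-export the new helper at the `nn` package level. A quick hedged check, assuming a 2.4.10 install:

```python
# `no_init_parameters` should now be reachable from mindspore.nn directly,
# not only from mindspore.nn.utils.
from mindspore import nn

print("no_init_parameters" in nn.__all__)  # True
with nn.no_init_parameters():
    pass  # Parameters created in this block skip initialization
```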
mindspore/nn/cell.py CHANGED
@@ -32,7 +32,8 @@ from mindspore import context
 from mindspore._c_expression import init_pipeline, update_func_graph_hyper_params, Cell_, FuncGraph, MixedPrecisionType
 from mindspore import _checkparam as Validator
 from mindspore.common import dtype as mstype
-from mindspore.common.api import _cell_graph_executor, _pynative_executor, _get_args_for_run, cells_compile_cache, _no_grad
+from mindspore.common.api import _cell_graph_executor, _pynative_executor, _get_args_for_run, cells_compile_cache, \
+    _no_grad
 from mindspore.common.api import _generate_branch_control_input, _convert_python_data, _get_args_for_run_predict
 from mindspore.common.api import _process_dyn_args, _generate_dyn_compile_args
 from mindspore.common.parameter import Parameter, ParameterTuple
@@ -45,6 +46,7 @@ from mindspore._check_jit_forbidden_api import jit_forbidden_register
 from mindspore.common._decorator import deprecated
 from mindspore.common._register_for_recompute import recompute_registry
 
+
 class Cell(Cell_):
     """
     The basic building block of neural networks in MindSpore. The model or neural network layer should inherit this
@@ -2582,7 +2584,7 @@ class Cell(Cell_):
         """
         if context.get_context("mode") == context.PYNATIVE_MODE:
             self._recompute_cell = recompute_registry.get()(self.construct)
-            self._recompute()
+            self._add_recompute_flag()
             return
         self._recompute()
         if 'mp_comm_recompute' in kwargs.keys():
@@ -2685,6 +2687,18 @@ class Cell(Cell_):
         if hasattr(network, "_amp_level"):
             self._amp_level = getattr(network, "_amp_level")
 
+    def _add_recompute_flag(self):
+        """
+        Set pynative cell recomputed.
+        """
+        if not self._has_config_recompute:
+            self._has_config_recompute = True
+        else:
+            logger.info("The recompute interface can be configured only once."
+                        " If the parent cell is configured, the child cell should not be configured")
+        for cell in self.cells():
+            cell._add_recompute_flag()
+
 
 class GraphCell(Cell):
     """
mindspore/nn/layer/conv.py CHANGED
@@ -862,6 +862,9 @@ class Conv3dTranspose(_Conv):
     However, when `stride` > 1, Conv2d maps multiple input shapes to the same output shape. Deconvolutional network
     can refer to `Deconvolutional Networks <https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf>`_.
 
+    Note:
+        For Atlas A2 training series products, `output_padding` is currently not supported.
+
     Args:
         in_channels (int): The channel number of the input tensor of the Conv3dTranspose layer.
         out_channels (int): The channel number of the output tensor of the Conv3dTranspose layer.
mindspore/nn/layer/pooling.py CHANGED
@@ -297,6 +297,9 @@ class MaxPool3d(_PoolNd):
         \max_{l=0, \ldots, d_{ker}-1} \max_{m=0, \ldots, h_{ker}-1} \max_{n=0, \ldots, w_{ker}-1}
         \text{input}(N_i, C_j, s_0 \times d + l, s_1 \times h + m, s_2 \times w + n)
 
+    .. note::
+        For Atlas training series products, this interface is not supported.
+
     Args:
         kernel_size (Union[int, tuple[int]]): The size of kernel used to take the maximum value,
             is an int number or a single element tuple that represents depth, height and width of the kernel, or a tuple
@@ -1032,16 +1035,11 @@ class AvgPool2dExt(Cell):
         >>> import numpy as np
         >>> from mindspore import Tensor, nn
         >>> from mindspore import dtype as mstype
-        >>> x = Tensor(np.arange(1 * 3 * 3 * 4).reshape(1, 3, 3, 4), mstype.float32)
-        >>> m = nn.AvgPool2dExt(x, kernel_size=2, stride=1)
-        >>> output = m(x)
-        >>> print(output)
-        [[[[ 2.5  3.5  4.5]
-           [ 6.5  7.5  8.5]]
-          [[14.5 15.5 16.5]
-           [18.5 19.5 20.5]]
-          [[26.5 27.5 28.5]
-           [30.5 31.5 32.5]]]]
+        >>> input = Tensor(np.arange(1 * 3 * 3 * 4).reshape(1, 3, 3, 4), mstype.float32)
+        >>> net = nn.AvgPool2dExt(kernel_size=2, stride=1)
+        >>> output = net(input)
+        >>> print(output.shape)
+        (1, 3, 2, 3)
     """
     def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,
                  count_include_pad=True, divisor_override=None):
mindspore/nn/utils/__init__.py ADDED
@@ -0,0 +1,22 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+nn.utils.
+"""
+from __future__ import absolute_import
+
+from .init import no_init_parameters
+
+__all__ = ["no_init_parameters"]
mindspore/nn/utils/init.py ADDED
@@ -0,0 +1,71 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""init for nn.Cell."""
+from __future__ import absolute_import
+
+from contextlib import contextmanager
+from mindspore.common.parameter import Parameter
+
+
+@contextmanager
+def no_init_parameters():
+    r"""
+    In scenarios where a checkpoint is loaded, parameters within the network instantiation will be
+    instantiated and occupy physical memory. Loading a checkpoint will replace the parameter values.
+    Decorator can be applied during network instantiation to add an attribute `init_param` to all
+    parameters within the current Cell, setting it to `init_param=False` .
+    When `init_param=False` is detected, the initialization of the parameters is skipped,
+    and the parameters are assigned values directly from the checkpoint during loading,
+    which can optimize performance and reduce physical memory usage.
+
+    Note:
+        Initialization of parameters created with `initializer` can only be skipped.
+        Parameters created by `Tensor` or `numpy` cannot be skipped.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> from mindspore import nn, ops, load_checkpoint
+        >>> from mindspore.common.initializer import initializer
+        >>> from mindspore.nn.utils import no_init_parameters
+        >>> # 1. Add a decorator to the network that requires delayed initialization
+        >>> class Net(nn.Cell):
+        ...     def __init__(self, in_channels, out_channels):
+        ...         super().__init__()
+        ...         self.weight = ms.Parameter(initializer("normal", [in_channels, out_channels], ms.float32))
+        ...         self.bias = ms.Parameter(initializer("normal", [out_channels], ms.float32))
+        ...         self.matmul = ops.MatMul()
+        ...         self.add = ops.Add()
+        ...
+        ...     def construct(self, x):
+        ...         x = self.matmul(x, self.weight)
+        ...         x = self.add(x, self.bias)
+        ...         return x
+        >>> with no_init_parameters():
+        ...     # After instantiation, all parameters in the net are not initialized
+        ...     net = Net(28*28, 64)
+        >>> # 2. Load checkpoint parameters to the net
+        >>> load_checkpoint('./checkpoint/test_net.ckpt', net=net)
+        >>> # 3. After loading the checkpoint, manually call init_parameters_data() to initialize
+        >>> # the uninitialized parameters in the net if need. If the network is executed,
+        >>> # the framework will automatically call this interface.
+        >>> net.init_parameters_data()
+    """
+    init_class = Parameter
+    setattr(init_class, "init_param", False)
+    try:
+        yield
+    finally:
+        setattr(init_class, "init_param", True)
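The context manager works by flipping a class-level attribute on `Parameter`, so it affects every parameter constructed anywhere in the process while the block is active (and is therefore not thread-safe). A self-contained illustration of the same pattern, with `Param` as a stand-in for mindspore's `Parameter`:

```python
from contextlib import contextmanager

class Param:
    init_param = True  # class-level default read by instances

@contextmanager
def no_init():
    Param.init_param = False
    try:
        yield
    finally:
        Param.init_param = True  # restored even if the block raises

with no_init():
    assert Param().init_param is False  # instances see the class attribute
assert Param().init_param is True
```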
mindspore/ops/_grad_experimental/grad_comm_ops.py CHANGED
@@ -16,7 +16,7 @@
 """Generate bprop for comm ops"""
 from __future__ import division
 from __future__ import absolute_import
-from mindspore import Tensor
+from mindspore import Tensor, Parameter
 import mindspore.common.dtype as mstype
 from mindspore.ops import functional as F
 from mindspore.communication import get_rank, get_group_size
@@ -37,6 +37,9 @@ from mindspore.ops._grad_experimental.grad_base import bprop_getters
 from mindspore.ops.operations import _grad_ops as G
 import mindspore as ms
 
+_device_local_norm = None
+if ms.get_auto_parallel_context("dump_device_local_norm"):
+    _device_local_norm = Parameter(Tensor(0.0, mstype.float32), name="_device_local_norm", requires_grad=False)
 
 @bprop_getters.register(AllReduce)
 def get_bprop_all_reduce(self):
@@ -247,10 +250,15 @@ def get_bprop_mirror_micro_step_operator(self):
     reduce_sum = P.ReduceSum(keep_dims=False)
     square = P.Square()
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
 
     def bprop(x, z, out, dout):
-        if dump_local_norm:
-            z = F.depend(z, ln_print("dump local norm: ", param_name, reduce_sum(square((z)))))
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((z)))
+            if dump_local_norm:
+                z = F.depend(z, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                z = F.depend(z, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
         real_grad = z
         assign_out = dout
         if issubclass_(F.typeof(dout), mstype.tensor_type):
@@ -373,6 +381,7 @@ def get_bprop_micro_step_all_gather(self):
     reduce_sum = P.ReduceSum(keep_dims=False)
     square = P.Square()
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
 
     def bprop(x, z, out, dout):
         if with_mirror_operator:
@@ -383,8 +392,12 @@ def get_bprop_micro_step_all_gather(self):
             real_grad = F.tensor_mul(real_grad, scale)
             return (real_grad, cast(out_tensor, dtype(z)))
         z = F.depend(z, dout)
-        if dump_local_norm:
-            z = F.depend(z, ln_print("dump local norm: ", param_name, reduce_sum(square((z)))))
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((z)))
+            if dump_local_norm:
+                z = F.depend(z, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                z = F.depend(z, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
         if not do_mirror:
             return (z, cast(out_tensor, dtype(z)))
         real_grad = reduce_scatter(z)
@@ -586,6 +599,7 @@ def get_bprop_mirror_operator(self):
 
     dev_num_r = 1.0
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
     if dev_num > 1:
         dev_num_r = 1.0 / dev_num
         all_reduce = AllReduce(group=group)
@@ -608,8 +622,12 @@ def get_bprop_mirror_operator(self):
         all_reduce.set_prim_instance_name(instance_name)
 
     def bprop(x, out, dout):
-        if dump_local_norm:
-            dout = F.depend(dout, ln_print("dump local norm: ", param_name, reduce_sum(square((dout)))))
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((dout)))
+            if dump_local_norm:
+                dout = F.depend(dout, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                dout = F.depend(dout, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
 
         if dev_num == 1:
             return (dout,)
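Note that the flags are consulted when the bprops are built, and the `_device_local_norm` accumulator is created at module import time, so the context must be configured before the network is constructed. A hedged usage sketch, assuming `dump_device_local_norm` is accepted by `set_auto_parallel_context` in 2.4.10 (consistent with the `_auto_parallel_context.py` change listed above):

```python
# Set the flag before building the network, since _device_local_norm is
# created when grad_comm_ops is imported.
import mindspore as ms

ms.set_auto_parallel_context(dump_device_local_norm=True)
print(ms.get_auto_parallel_context("dump_device_local_norm"))  # True
# Each backward pass then accumulates squared gradient norms into the
# _device_local_norm parameter instead of (or in addition to) printing them.
```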
mindspore/ops/auto_generate/gen_ops_prim.py CHANGED
@@ -2387,7 +2387,8 @@ class BatchMatMul(Primitive):
 
         \text{output}[..., :, :] = \text{matrix}(x[..., :, :]) * \text{matrix}(y[..., :, :])
 
-    The rank of both two input tensors must be same and not less than `2`.
+    The rank of the two input tensors must be at least `2`, and the two input tensors must have the same rank
+    if the environment is GPU or CPU.
 
     Args:
         transpose_a (bool): If ``True`` , the last two dimensions of `x` is transposed before multiplication.
@@ -9488,7 +9489,7 @@ class MatMul(Primitive):
 
     .. math::
 
-        Output_{i j}=\sum_{k=1}^{p} a_{i k} b_{k j}=a_{i 1} b_{1 j}+a_{i 2} b_{2 j}+\cdots+a_{i p} b_{p j}, p\in N
+        (Output)_{i j}=\sum_{k=1}^{p} a_{i k} b_{k j}=a_{i 1} b_{1 j}+a_{i 2} b_{2 j}+\cdots+a_{i p} b_{p j}, p\in N
 
     where the :math:`i,j` indicates the output of the i-th row and j-th column element.
 
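A quick numeric check of the corrected formula with p = 2, assuming a working MindSpore install:

```python
# (Output)_{ij} = sum_k a_{ik} b_{kj}, verified on 2x2 inputs.
import numpy as np
import mindspore as ms
from mindspore import ops

a = ms.Tensor(np.array([[1., 2.], [3., 4.]], np.float32))
b = ms.Tensor(np.array([[5., 6.], [7., 8.]], np.float32))
print(ops.MatMul()(a, b))
# [[19. 22.]
#  [43. 50.]]   e.g. Output_00 = 1*5 + 2*7 = 19
```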
mindspore/ops/function/math_func.py CHANGED
@@ -9088,9 +9088,9 @@ def remainder(input, other):
     both dtypes cannot be bool, and the shapes of them could be broadcast. When the inputs are one tensor
     and one scalar, the scalar could only be a constant.
 
-    .. math::
+    .. code:: python
 
-        remainder(input, other) = input - input.div(other, rounding\_mode="floor") * other
+        remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
 
     .. warning::
         - When the elements of input exceed 2048, there might be accuracy problems.
@@ -9135,9 +9135,10 @@ def remainder_ext(input, other):
 
     Supports broadcasting to a common shape and implicit type promotion.
 
-    .. math::
+    .. code:: python
+
+        remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
 
-        remainder(input, other) = input - input.div(other, rounding\_mode="floor") * other
 
     Note:
         Complex inputs are not supported. At least one input need to be tensor, but not both are bool tensors.
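The identity that the docstrings now show as code uses floor-division semantics, so the result takes the sign of the divisor. It can be checked numerically:

```python
# remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
import numpy as np
import mindspore as ms
from mindspore import ops

x = ms.Tensor(np.array([-3., -2., 8.], np.float32))
y = ms.Tensor(np.array([2., 3., 5.], np.float32))
print(ops.remainder(x, y))                           # [1. 1. 3.]
print(x - ops.div(x, y, rounding_mode="floor") * y)  # [1. 1. 3.]
```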
mindspore/ops/operations/comm_ops.py CHANGED
@@ -988,6 +988,9 @@ class NeighborExchangeV2(Primitive):
         in the same subnet, please check the `details \
         <https://www.mindspore.cn/docs/en/master/api_python/samples/ops/communicate_ops.html#notes>`_.
 
+        Users need to ensure that the length of the received data `recv_lens` is consistent with that of
+        the sent data `send_lens`.
+
     Args:
         send_rank_ids (list(int)): Ranks which the data is sent to. 8 rank_ids represents 8 directions, if one
             direction is not send to , set it -1.
@@ -1393,7 +1396,7 @@ class Send(PrimitiveWithInfer):
        >>>     def __init__(self):
        >>>         super(SendNet, self).__init__()
        >>>         self.depend = ops.Depend()
-        >>>         self.send = ops.Send(st_tag=0, dest_rank=8, group="hccl_world_group")
+        >>>         self.send = ops.Send(sr_tag=0, dest_rank=8, group="hccl_world_group")
        >>>
        >>>     def construct(self, x):
        >>>         out = self.depend(x, self.send(x))
mindspore/ops/operations/custom_ops.py CHANGED
@@ -251,11 +251,13 @@ class Custom(ops.PrimitiveWithInfer):
 
         - "xxx.so" file generation:
 
-          1) GPU Platform: Given user defined "xxx.cu" file (ex. "{path}/add.cu"), use nvcc command to compile
-          it.(ex. "nvcc --shared -Xcompiler -fPIC -o add.so add.cu")
+          1) GPU Platform: Given user defined "xxx.cu" file (ex. "{path}/add.cu"),
+          use nvcc command to compile
+          it.(ex. :code:`nvcc --shared -Xcompiler -fPIC -o add.so add.cu`)
 
-          2) CPU Platform: Given user defined "xxx.cc" file (ex. "{path}/add.cc"), use g++/gcc command to
-          compile it.(ex. "g++ --shared -fPIC -o add.so add.cc")
+          2) CPU Platform: Given user defined "xxx.cc" file (ex. "{path}/add.cc"),
+          use g++/gcc command to
+          compile it.(ex. :code:`g++ --shared -fPIC -o add.so add.cc`)
 
         - Define a "xxx.cc"/"xxx.cu" file: