mindspore 2.4.1__cp310-cp310-win_amd64.whl → 2.4.10__cp310-cp310-win_amd64.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of mindspore might be problematic.

Files changed (47)
  1. mindspore/.commit_id +1 -1
  2. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  3. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  4. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  5. mindspore/common/api.py +1 -4
  6. mindspore/common/file_system.py +2 -0
  7. mindspore/common/parameter.py +1 -14
  8. mindspore/communication/_comm_helper.py +5 -0
  9. mindspore/context.py +7 -2
  10. mindspore/dataset/engine/datasets_standard_format.py +17 -0
  11. mindspore/dataset/engine/datasets_user_defined.py +27 -1
  12. mindspore/experimental/llm_boost/__init__.py +2 -2
  13. mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
  14. mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
  15. mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
  16. mindspore/include/api/context.h +1 -1
  17. mindspore/include/dataset/constants.h +2 -2
  18. mindspore/mindspore_backend.dll +0 -0
  19. mindspore/mindspore_common.dll +0 -0
  20. mindspore/mindspore_core.dll +0 -0
  21. mindspore/mindspore_np_dtype.dll +0 -0
  22. mindspore/mindspore_ops.dll +0 -0
  23. mindspore/nn/__init__.py +2 -0
  24. mindspore/nn/cell.py +16 -2
  25. mindspore/nn/layer/conv.py +3 -0
  26. mindspore/nn/layer/pooling.py +8 -10
  27. mindspore/nn/utils/__init__.py +22 -0
  28. mindspore/nn/utils/init.py +71 -0
  29. mindspore/ops/_grad_experimental/grad_comm_ops.py +25 -7
  30. mindspore/ops/auto_generate/gen_ops_prim.py +3 -2
  31. mindspore/ops/function/math_func.py +5 -4
  32. mindspore/ops/operations/comm_ops.py +4 -1
  33. mindspore/ops/operations/custom_ops.py +6 -4
  34. mindspore/ops/operations/nn_ops.py +7 -2
  35. mindspore/parallel/_auto_parallel_context.py +23 -4
  36. mindspore/parallel/_cell_wrapper.py +22 -3
  37. mindspore/parallel/_utils.py +0 -1
  38. mindspore/run_check/_check_version.py +17 -8
  39. mindspore/train/callback/_tft_register.py +7 -6
  40. mindspore/train/model.py +1 -0
  41. mindspore/train/serialization.py +4 -1
  42. mindspore/version.py +1 -1
  43. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/METADATA +2 -2
  44. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/RECORD +47 -45
  45. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
  46. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
  47. {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
mindspore/experimental/llm_boost/atb/llama_boost.py CHANGED
@@ -15,10 +15,16 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import (
+    AtbBoostBase,
+    PositionEmbeddingType,
+    NormType,
+)
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
+CPP_LLAMA_MODEL_CLASS_NAME = "llama_LlamaDecoderModel"
+
 
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Llama")
 class LlamaBoost(AtbBoostBase):
@@ -30,14 +36,17 @@ class LlamaBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "llama_parallel_DecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "llama_parallel_DecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
         coder_param = {
-            "rmsNormEps": self.config.rms_norm_eps,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
@@ -46,32 +55,41 @@ class LlamaBoost(AtbBoostBase):
             "isFA": False,
             "isBF16": self.dtype == mstype.bfloat16,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)],
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "isEmbeddingParallel": False,
             "isLmHeadParallel": not self.config.parallel_config.vocab_emb_dp,
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": True,
-            "kvQuant": self.kv_quant is not None,
+            "enableSwiGLU": True,
+            "enablekvQuant": self.kv_quant is not None,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": "lccl",
+            "backend": self.config.communication_backend,
             "rankTableFile": "",
-            "positionEmbeddingType": self.position_embedding_type,
+            "positionEmbeddingType": PositionEmbeddingType.ROPE,
             "hiddenSize": self.config.hidden_size,
             "gemma": False,
-            "enableAddNorm": True,
-            "supportCompressHead": False,
+            "enableAddNorm": False,
+            "enableCompressHead": False,
+            "isUnpadInputs": True,
         }
         encoder_param = {
-            **coder_param, "isPrefill": True,
-            "supportLcoc": True,
-            "supportSpeculate": False,
-            "skipWordEmbedding": False
+            **coder_param,
+            "isPrefill": True,
+            "enableLcoc": True,
+            "enableSpeculate": False,
+            "skipWordEmbedding": False,
+            "enableSplitFuse": False,
         }
         decoder_param = {
-            **coder_param, "isPrefill": False, "supportLcoc": False,
-            "supportSpeculate": False
+            **coder_param,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
@@ -92,14 +110,15 @@
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen,
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
         self.acl_decoder_operation_inputs[1] = self.placeholder
-        self.acl_decoder_operation_inputs[2] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_decoder_operation_inputs[2] = position_ids
         self.acl_decoder_operation_inputs[3] = cos_embed
         self.acl_decoder_operation_inputs[4] = sin_embed
         self.acl_decoder_operation_inputs[5] = attention_mask
@@ -108,8 +127,6 @@
         self.acl_decoder_operation_inputs[8] = self.placeholder
         self.acl_decoder_operation_inputs[9] = self.placeholder
         self.acl_decoder_operation_inputs[10] = self.placeholder
-        self.acl_decoder_operation_inputs[11] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[12] = self.cast(
-            lm_head_indices, mstype.int64)
+        self.acl_decoder_operation_inputs[11] = input_lengths
+        self.acl_decoder_operation_inputs[12] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
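One detail worth noting in the hunks above: `coder_param` is passed through `json.dumps`, so the new enum-valued fields (`NormType.RMS_NORM`, `PositionEmbeddingType.ROPE`) must reduce to plain JSON values. A minimal sketch of how that works, assuming the enums exported by `boost_base` are integer-valued; the member names and values below are illustrative assumptions, not taken from the diff:

```python
# Illustrative only: assumed IntEnum definitions standing in for the real
# ones in mindspore/experimental/llm_boost/atb/boost_base.py.
from enum import IntEnum
import json

class NormType(IntEnum):
    LAYER_NORM = 0  # assumed value
    RMS_NORM = 1    # assumed value

# IntEnum members are ints, so json.dumps emits them as plain numbers:
param = {"normEps": 1e-5, "normType": NormType.RMS_NORM}
print(json.dumps(param))  # {"normEps": 1e-05, "normType": 1}
```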
mindspore/experimental/llm_boost/atb/qwen_boost.py CHANGED
@@ -15,11 +15,14 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase, NormType
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
 
+CPP_QWEN_MODEL_CLASS_NAME = "qwen_QwenDecoderModel"
+
+
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Qwen")
 class QwenBoost(AtbBoostBase):
     """QwenBoost class"""
@@ -30,9 +33,11 @@ class QwenBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "qwen_DecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "qwen_DecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
@@ -42,24 +47,43 @@ class QwenBoost(AtbBoostBase):
             "withEmbedding": True,
             "isEmbeddingParallel": True,
             "isLmHeadParallel": True,
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": not self.need_nz,
-            "rmsNormEps": self.config.rms_norm_eps,
+            "enableSwiGLU": not self.need_nz,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
             "numKeyValueHeadsPerRank": self.n_kv_heads // self.device_num,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": "lccl",
+            "backend": self.config.communication_backend,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)],
-            "kvQuant": self.kv_quant is not None,
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearHasBias": [[True, False, False, False]] * self.num_layers,
+            "enableKvQuant": self.kv_quant is not None,
+            "enableLora": False,
+            "isUnpadInputs": True,
+            "enableAddNorm": False,
+        }
+        encoder_param = {
+            **param_dict,
+            "isPrefill": True,
+            "enableLcoc": False,
+            "enableSplitFuse": False,
+        }
+        decoder_param = {
+            **param_dict,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
+            "enablePrefixCache": False,
         }
-        encoder_param = {**param_dict, "isPrefill": True, "supportLcoc": False}
-        decoder_param = {**param_dict, "isPrefill": False,
-                         "supportLcoc": False, "supportSpeculate": False}
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
 
@@ -79,13 +103,14 @@
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen,
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
-        self.acl_decoder_operation_inputs[1] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
+        self.acl_decoder_operation_inputs[1] = position_ids
         self.acl_decoder_operation_inputs[2] = cos_embed
         self.acl_decoder_operation_inputs[3] = sin_embed
         self.acl_decoder_operation_inputs[4] = attention_mask
@@ -93,9 +118,7 @@
         self.acl_decoder_operation_inputs[6] = slots
         self.acl_decoder_operation_inputs[7] = self.placeholder
         self.acl_decoder_operation_inputs[8] = self.placeholder
-        self.acl_decoder_operation_inputs[9] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[10] = self.cast(
-            lm_head_indices, mstype.int64)
-        self.acl_decoder_operation_inputs[11] = self.placeholder
+        self.acl_decoder_operation_inputs[9] = self.placeholder
+        self.acl_decoder_operation_inputs[10] = input_lengths
+        self.acl_decoder_operation_inputs[11] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
mindspore/include/api/context.h CHANGED
@@ -236,7 +236,7 @@ std::string DeviceInfoContext::GetProviderDevice() const { return CharToString(G
 void DeviceInfoContext::SetProviderDevice(const std::string &device) { SetProviderDevice(StringToChar(device)); }
 
 /// \brief Derived from DeviceInfoContext, The configuration of the model running auto on the Host Devices, include
-/// CPU/GPU/NPU/Ascend310/Ascend910. This option is only valid for MindSpore Lite.
+/// CPU/GPU/NPU/Ascend. This option is only valid for MindSpore Lite.
 class MS_API AutoDeviceInfo : public DeviceInfoContext {
  public:
   /// \brief Get the type of this DeviceInfoContext.
mindspore/include/dataset/constants.h CHANGED
@@ -108,8 +108,8 @@ enum class DATASET_API ManualOffloadMode {
 enum class DATASET_API MapTargetDevice {
   kCpu = 0,     ///< CPU Device.
   kGpu,         ///< Gpu Device.
-  kAscend310,   ///< Ascend310 Device.
-  kAscend910B,  ///< Ascend910B Device.
+  kAscend310,   ///<
+  kAscend910B,  ///<
   kInvalid = 100
 };
 
mindspore/mindspore_backend.dll CHANGED (binary)
mindspore/mindspore_common.dll CHANGED (binary)
mindspore/mindspore_core.dll CHANGED (binary)
mindspore/mindspore_np_dtype.dll CHANGED (binary)
mindspore/mindspore_ops.dll CHANGED (binary)
mindspore/nn/__init__.py CHANGED
@@ -31,6 +31,7 @@ from mindspore.nn.wrap import *
 from mindspore.nn.grad import Jvp, Vjp
 from mindspore.nn.sparse import *
 from mindspore.nn.reinforcement import *
+from mindspore.nn.utils import *
 
 __all__ = ["Cell", "GraphCell"]
 __all__.extend(layer.__all__)
@@ -43,5 +44,6 @@ __all__.extend(sparse.__all__)
 __all__.extend(learning_rate_schedule.__all__)
 __all__.extend(dynamic_lr.__all__)
 __all__.extend(reinforcement.__all__)
+__all__.extend(utils.__all__)
 
 __all__.sort()
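The star import plus the `__all__` extension re-export the new helper at the `nn` package level. A quick hedged check, assuming a 2.4.10 install:

```python
# `no_init_parameters` should now be reachable from mindspore.nn directly,
# not only from mindspore.nn.utils.
from mindspore import nn

print("no_init_parameters" in nn.__all__)  # True
with nn.no_init_parameters():
    pass  # Parameters created in this block skip initialization
```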
mindspore/nn/cell.py CHANGED
@@ -32,7 +32,8 @@ from mindspore import context
 from mindspore._c_expression import init_pipeline, update_func_graph_hyper_params, Cell_, FuncGraph, MixedPrecisionType
 from mindspore import _checkparam as Validator
 from mindspore.common import dtype as mstype
-from mindspore.common.api import _cell_graph_executor, _pynative_executor, _get_args_for_run, cells_compile_cache, _no_grad
+from mindspore.common.api import _cell_graph_executor, _pynative_executor, _get_args_for_run, cells_compile_cache, \
+    _no_grad
 from mindspore.common.api import _generate_branch_control_input, _convert_python_data, _get_args_for_run_predict
 from mindspore.common.api import _process_dyn_args, _generate_dyn_compile_args
 from mindspore.common.parameter import Parameter, ParameterTuple
@@ -45,6 +46,7 @@ from mindspore._check_jit_forbidden_api import jit_forbidden_register
 from mindspore.common._decorator import deprecated
 from mindspore.common._register_for_recompute import recompute_registry
 
+
 class Cell(Cell_):
     """
     The basic building block of neural networks in MindSpore. The model or neural network layer should inherit this
@@ -2582,7 +2584,7 @@ class Cell(Cell_):
         """
         if context.get_context("mode") == context.PYNATIVE_MODE:
             self._recompute_cell = recompute_registry.get()(self.construct)
-            self._recompute()
+            self._add_recompute_flag()
             return
         self._recompute()
         if 'mp_comm_recompute' in kwargs.keys():
@@ -2685,6 +2687,18 @@ class Cell(Cell_):
         if hasattr(network, "_amp_level"):
             self._amp_level = getattr(network, "_amp_level")
 
+    def _add_recompute_flag(self):
+        """
+        Set pynative cell recomputed.
+        """
+        if not self._has_config_recompute:
+            self._has_config_recompute = True
+        else:
+            logger.info("The recompute interface can be configured only once."
+                        " If the parent cell is configured, the child cell should not be configured")
+        for cell in self.cells():
+            cell._add_recompute_flag()
+
 
 class GraphCell(Cell):
     """
mindspore/nn/layer/conv.py CHANGED
@@ -862,6 +862,9 @@ class Conv3dTranspose(_Conv):
     However, when `stride` > 1, Conv2d maps multiple input shapes to the same output shape. Deconvolutional network
     can refer to `Deconvolutional Networks <https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf>`_.
 
+    Note:
+        For Atlas A2 training series products, `output_padding` is currently not supported.
+
     Args:
         in_channels (int): The channel number of the input tensor of the Conv3dTranspose layer.
         out_channels (int): The channel number of the output tensor of the Conv3dTranspose layer.
mindspore/nn/layer/pooling.py CHANGED
@@ -297,6 +297,9 @@ class MaxPool3d(_PoolNd):
         \max_{l=0, \ldots, d_{ker}-1} \max_{m=0, \ldots, h_{ker}-1} \max_{n=0, \ldots, w_{ker}-1}
         \text{input}(N_i, C_j, s_0 \times d + l, s_1 \times h + m, s_2 \times w + n)
 
+    .. note::
+        For Atlas training series products, this interface is not supported.
+
     Args:
         kernel_size (Union[int, tuple[int]]): The size of kernel used to take the maximum value,
             is an int number or a single element tuple that represents depth, height and width of the kernel, or a tuple
@@ -1032,16 +1035,11 @@ class AvgPool2dExt(Cell):
         >>> import numpy as np
         >>> from mindspore import Tensor, nn
         >>> from mindspore import dtype as mstype
-        >>> x = Tensor(np.arange(1 * 3 * 3 * 4).reshape(1, 3, 3, 4), mstype.float32)
-        >>> m = nn.AvgPool2dExt(x, kernel_size=2, stride=1)
-        >>> output = m(x)
-        >>> print(output)
-        [[[[ 2.5  3.5  4.5]
-           [ 6.5  7.5  8.5]]
-          [[14.5 15.5 16.5]
-           [18.5 19.5 20.5]]
-          [[26.5 27.5 28.5]
-           [30.5 31.5 32.5]]]]
+        >>> input = Tensor(np.arange(1 * 3 * 3 * 4).reshape(1, 3, 3, 4), mstype.float32)
+        >>> net = nn.AvgPool2dExt(kernel_size=2, stride=1)
+        >>> output = net(input)
+        >>> print(output.shape)
+        (1, 3, 2, 3)
     """
     def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,
                  count_include_pad=True, divisor_override=None):
mindspore/nn/utils/__init__.py ADDED
@@ -0,0 +1,22 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+nn.utils.
+"""
+from __future__ import absolute_import
+
+from .init import no_init_parameters
+
+__all__ = ["no_init_parameters"]
mindspore/nn/utils/init.py ADDED
@@ -0,0 +1,71 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""init for nn.Cell."""
+from __future__ import absolute_import
+
+from contextlib import contextmanager
+from mindspore.common.parameter import Parameter
+
+
+@contextmanager
+def no_init_parameters():
+    r"""
+    In scenarios where a checkpoint is loaded, parameters within the network instantiation will be
+    instantiated and occupy physical memory. Loading a checkpoint will replace the parameter values.
+    Decorator can be applied during network instantiation to add an attribute `init_param` to all
+    parameters within the current Cell, setting it to `init_param=False` .
+    When `init_param=False` is detected, the initialization of the parameters is skipped,
+    and the parameters are assigned values directly from the checkpoint during loading,
+    which can optimize performance and reduce physical memory usage.
+
+    Note:
+        Initialization of parameters created with `initializer` can only be skipped.
+        Parameters created by `Tensor` or `numpy` cannot be skipped.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> from mindspore import nn, ops, load_checkpoint
+        >>> from mindspore.common.initializer import initializer
+        >>> from mindspore.nn.utils import no_init_parameters
+        >>> # 1. Add a decorator to the network that requires delayed initialization
+        >>> class Net(nn.Cell):
+        ...     def __init__(self, in_channels, out_channels):
+        ...         super().__init__()
+        ...         self.weight = ms.Parameter(initializer("normal", [in_channels, out_channels], ms.float32))
+        ...         self.bias = ms.Parameter(initializer("normal", [out_channels], ms.float32))
+        ...         self.matmul = ops.MatMul()
+        ...         self.add = ops.Add()
+        ...
+        ...     def construct(self, x):
+        ...         x = self.matmul(x, self.weight)
+        ...         x = self.add(x, self.bias)
+        ...         return x
+        >>> with no_init_parameters():
+        ...     # After instantiation, all parameters in the net are not initialized
+        ...     net = Net(28*28, 64)
+        >>> # 2. Load checkpoint parameters to the net
+        >>> load_checkpoint('./checkpoint/test_net.ckpt', net=net)
+        >>> # 3. After loading the checkpoint, manually call init_parameters_data() to initialize
+        >>> # the uninitialized parameters in the net if need. If the network is executed,
+        >>> # the framework will automatically call this interface.
+        >>> net.init_parameters_data()
+    """
+    init_class = Parameter
+    setattr(init_class, "init_param", False)
+    try:
+        yield
+    finally:
+        setattr(init_class, "init_param", True)
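The context manager works by flipping a class-level attribute on `Parameter`, so it affects every parameter constructed anywhere in the process while the block is active (and is therefore not thread-safe). A self-contained illustration of the same pattern, with `Param` as a stand-in for mindspore's `Parameter`:

```python
from contextlib import contextmanager

class Param:
    init_param = True  # class-level default read by instances

@contextmanager
def no_init():
    Param.init_param = False
    try:
        yield
    finally:
        Param.init_param = True  # restored even if the block raises

with no_init():
    assert Param().init_param is False  # instances see the class attribute
assert Param().init_param is True
```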
mindspore/ops/_grad_experimental/grad_comm_ops.py CHANGED
@@ -16,7 +16,7 @@
 """Generate bprop for comm ops"""
 from __future__ import division
 from __future__ import absolute_import
-from mindspore import Tensor
+from mindspore import Tensor, Parameter
 import mindspore.common.dtype as mstype
 from mindspore.ops import functional as F
 from mindspore.communication import get_rank, get_group_size
@@ -37,6 +37,9 @@ from mindspore.ops._grad_experimental.grad_base import bprop_getters
 from mindspore.ops.operations import _grad_ops as G
 import mindspore as ms
 
+_device_local_norm = None
+if ms.get_auto_parallel_context("dump_device_local_norm"):
+    _device_local_norm = Parameter(Tensor(0.0, mstype.float32), name="_device_local_norm", requires_grad=False)
 
 @bprop_getters.register(AllReduce)
 def get_bprop_all_reduce(self):
@@ -247,10 +250,15 @@ def get_bprop_mirror_micro_step_operator(self):
     reduce_sum = P.ReduceSum(keep_dims=False)
     square = P.Square()
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
 
     def bprop(x, z, out, dout):
-        if dump_local_norm:
-            z = F.depend(z, ln_print("dump local norm: ", param_name, reduce_sum(square((z)))))
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((z)))
+            if dump_local_norm:
+                z = F.depend(z, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                z = F.depend(z, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
         real_grad = z
         assign_out = dout
         if issubclass_(F.typeof(dout), mstype.tensor_type):
@@ -373,6 +381,7 @@ def get_bprop_micro_step_all_gather(self):
     reduce_sum = P.ReduceSum(keep_dims=False)
     square = P.Square()
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
 
     def bprop(x, z, out, dout):
         if with_mirror_operator:
@@ -383,8 +392,12 @@ def get_bprop_micro_step_all_gather(self):
             real_grad = F.tensor_mul(real_grad, scale)
             return (real_grad, cast(out_tensor, dtype(z)))
         z = F.depend(z, dout)
-        if dump_local_norm:
-            z = F.depend(z, ln_print("dump local norm: ", param_name, reduce_sum(square((z)))))
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((z)))
+            if dump_local_norm:
+                z = F.depend(z, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                z = F.depend(z, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
         if not do_mirror:
             return (z, cast(out_tensor, dtype(z)))
         real_grad = reduce_scatter(z)
@@ -586,6 +599,7 @@ def get_bprop_mirror_operator(self):
 
     dev_num_r = 1.0
     dump_local_norm = ms.get_auto_parallel_context("dump_local_norm")
+    dump_device_local_norm = ms.get_auto_parallel_context("dump_device_local_norm")
     if dev_num > 1:
         dev_num_r = 1.0 / dev_num
         all_reduce = AllReduce(group=group)
@@ -608,8 +622,12 @@ def get_bprop_mirror_operator(self):
         all_reduce.set_prim_instance_name(instance_name)
 
     def bprop(x, out, dout):
-        if dump_local_norm:
-            dout = F.depend(dout, ln_print("dump local norm: ", param_name, reduce_sum(square((dout)))))
+        if dump_local_norm or dump_device_local_norm:
+            _norm = reduce_sum(square((dout)))
+            if dump_local_norm:
+                dout = F.depend(dout, ln_print("dump local norm: ", param_name, _norm))
+            if dump_device_local_norm:
+                dout = F.depend(dout, F.assign_add(_device_local_norm, cast(_norm, _device_local_norm.dtype)))
 
         if dev_num == 1:
             return (dout,)
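Note that the flags are consulted when the bprops are built, and the `_device_local_norm` accumulator is created at module import time, so the context must be configured before the network is constructed. A hedged usage sketch, assuming `dump_device_local_norm` is accepted by `set_auto_parallel_context` in 2.4.10 (consistent with the `_auto_parallel_context.py` change listed above):

```python
# Set the flag before building the network, since _device_local_norm is
# created when grad_comm_ops is imported.
import mindspore as ms

ms.set_auto_parallel_context(dump_device_local_norm=True)
print(ms.get_auto_parallel_context("dump_device_local_norm"))  # True
# Each backward pass then accumulates squared gradient norms into the
# _device_local_norm parameter instead of (or in addition to) printing them.
```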
mindspore/ops/auto_generate/gen_ops_prim.py CHANGED
@@ -2387,7 +2387,8 @@ class BatchMatMul(Primitive):
 
         \text{output}[..., :, :] = \text{matrix}(x[..., :, :]) * \text{matrix}(y[..., :, :])
 
-    The rank of both two input tensors must be same and not less than `2`.
+    The rank of the two input tensors must be at least `2`, and the two input tensors must have the same rank
+    if the environment is GPU or CPU.
 
     Args:
         transpose_a (bool): If ``True`` , the last two dimensions of `x` is transposed before multiplication.
@@ -9488,7 +9489,7 @@ class MatMul(Primitive):
 
     .. math::
 
-        Output_{i j}=\sum_{k=1}^{p} a_{i k} b_{k j}=a_{i 1} b_{1 j}+a_{i 2} b_{2 j}+\cdots+a_{i p} b_{p j}, p\in N
+        (Output)_{i j}=\sum_{k=1}^{p} a_{i k} b_{k j}=a_{i 1} b_{1 j}+a_{i 2} b_{2 j}+\cdots+a_{i p} b_{p j}, p\in N
 
     where the :math:`i,j` indicates the output of the i-th row and j-th column element.
 
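A quick numeric check of the corrected formula with p = 2, assuming a working MindSpore install:

```python
# (Output)_{ij} = sum_k a_{ik} b_{kj}, verified on 2x2 inputs.
import numpy as np
import mindspore as ms
from mindspore import ops

a = ms.Tensor(np.array([[1., 2.], [3., 4.]], np.float32))
b = ms.Tensor(np.array([[5., 6.], [7., 8.]], np.float32))
print(ops.MatMul()(a, b))
# [[19. 22.]
#  [43. 50.]]   e.g. Output_00 = 1*5 + 2*7 = 19
```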
mindspore/ops/function/math_func.py CHANGED
@@ -9088,9 +9088,9 @@ def remainder(input, other):
     both dtypes cannot be bool, and the shapes of them could be broadcast. When the inputs are one tensor
     and one scalar, the scalar could only be a constant.
 
-    .. math::
+    .. code:: python
 
-        remainder(input, other) = input - input.div(other, rounding\_mode="floor") * other
+        remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
 
     .. warning::
         - When the elements of input exceed 2048, there might be accuracy problems.
@@ -9135,9 +9135,10 @@ def remainder_ext(input, other):
 
     Supports broadcasting to a common shape and implicit type promotion.
 
-    .. math::
+    .. code:: python
+
+        remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
 
-        remainder(input, other) = input - input.div(other, rounding\_mode="floor") * other
 
     Note:
         Complex inputs are not supported. At least one input need to be tensor, but not both are bool tensors.
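The identity that the docstrings now show as code uses floor-division semantics, so the result takes the sign of the divisor. It can be checked numerically:

```python
# remainder(input, other) == input - input.div(other, rounding_mode="floor") * other
import numpy as np
import mindspore as ms
from mindspore import ops

x = ms.Tensor(np.array([-3., -2., 8.], np.float32))
y = ms.Tensor(np.array([2., 3., 5.], np.float32))
print(ops.remainder(x, y))                           # [1. 1. 3.]
print(x - ops.div(x, y, rounding_mode="floor") * y)  # [1. 1. 3.]
```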
mindspore/ops/operations/comm_ops.py CHANGED
@@ -988,6 +988,9 @@ class NeighborExchangeV2(Primitive):
         in the same subnet, please check the `details \
         <https://www.mindspore.cn/docs/en/master/api_python/samples/ops/communicate_ops.html#notes>`_.
 
+        Users need to ensure that the length of the received data `recv_lens` is consistent with that of
+        the sent data `send_lens`.
+
     Args:
         send_rank_ids (list(int)): Ranks which the data is sent to. 8 rank_ids represents 8 directions, if one
             direction is not send to , set it -1.
@@ -1393,7 +1396,7 @@ class Send(PrimitiveWithInfer):
        >>>     def __init__(self):
        >>>         super(SendNet, self).__init__()
        >>>         self.depend = ops.Depend()
-        >>>         self.send = ops.Send(st_tag=0, dest_rank=8, group="hccl_world_group")
+        >>>         self.send = ops.Send(sr_tag=0, dest_rank=8, group="hccl_world_group")
        >>>
        >>>     def construct(self, x):
        >>>         out = self.depend(x, self.send(x))
mindspore/ops/operations/custom_ops.py CHANGED
@@ -251,11 +251,13 @@ class Custom(ops.PrimitiveWithInfer):
 
         - "xxx.so" file generation:
 
-          1) GPU Platform: Given user defined "xxx.cu" file (ex. "{path}/add.cu"), use nvcc command to compile
-          it.(ex. "nvcc --shared -Xcompiler -fPIC -o add.so add.cu")
+          1) GPU Platform: Given user defined "xxx.cu" file (ex. "{path}/add.cu"),
+          use nvcc command to compile
+          it.(ex. :code:`nvcc --shared -Xcompiler -fPIC -o add.so add.cu`)
 
-          2) CPU Platform: Given user defined "xxx.cc" file (ex. "{path}/add.cc"), use g++/gcc command to
-          compile it.(ex. "g++ --shared -fPIC -o add.so add.cc")
+          2) CPU Platform: Given user defined "xxx.cc" file (ex. "{path}/add.cc"),
+          use g++/gcc command to
+          compile it.(ex. :code:`g++ --shared -fPIC -o add.so add.cc`)
 
         - Define a "xxx.cc"/"xxx.cu" file: