mindspore-2.4.10-cp310-cp310-manylinux1_x86_64.whl → mindspore-2.5.0-cp310-cp310-manylinux1_x86_64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
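Before reading the file-level changes below, it can help to confirm which of the two wheels is actually active in the environment. A minimal sketch, using only the long-standing public helpers `mindspore.__version__` and `mindspore.run_check()`; nothing specific to 2.5.0 is assumed:

```python
# Minimal sanity check after swapping the 2.4.10 wheel for 2.5.0.
# Both helpers below predate this release, so nothing 2.5.0-specific is assumed.
import mindspore as ms

print(ms.__version__)  # should print "2.5.0" once the new wheel is installed
ms.run_check()         # runs a small computation and reports whether the backend works
```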
- mindspore/.commit_id +1 -1
- mindspore/Third_Party_Open_Source_Software_Notice +39 -0
- mindspore/__init__.py +8 -3
- mindspore/_akg/akg/composite/build_module.py +6 -2
- mindspore/_akg/akg/utils/kernel_exec.py +2 -2
- mindspore/_c_dataengine.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_c_mindrecord.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/_checkparam.py +0 -5
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/compile_config.py +64 -0
- mindspore/_extends/parse/deprecated/__init__.py +0 -0
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +375 -0
- mindspore/_extends/parse/parser.py +23 -5
- mindspore/_extends/parse/standard_method.py +123 -27
- mindspore/_extends/pijit/pijit_func_white_list.py +1 -1
- mindspore/amp.py +7 -1
- mindspore/boost/boost_cell_wrapper.py +136 -41
- mindspore/common/__init__.py +3 -1
- mindspore/common/_register_for_tensor.py +0 -1
- mindspore/common/_stub_tensor.py +25 -4
- mindspore/common/_tensor_cpp_method.py +17 -0
- mindspore/common/_tensor_docs.py +6132 -0
- mindspore/common/api.py +98 -21
- mindspore/common/dtype.py +34 -34
- mindspore/common/dump.py +2 -1
- mindspore/common/file_system.py +8 -3
- mindspore/common/generator.py +2 -0
- mindspore/common/hook_handle.py +3 -1
- mindspore/common/initializer.py +3 -4
- mindspore/common/lazy_inline.py +8 -2
- mindspore/common/mindir_util.py +10 -2
- mindspore/common/parameter.py +31 -15
- mindspore/common/tensor.py +713 -1337
- mindspore/communication/__init__.py +1 -1
- mindspore/communication/_comm_helper.py +5 -0
- mindspore/communication/comm_func.py +215 -173
- mindspore/communication/management.py +23 -20
- mindspore/context.py +285 -191
- mindspore/dataset/__init__.py +23 -19
- mindspore/dataset/callback/ds_callback.py +2 -1
- mindspore/dataset/core/config.py +84 -3
- mindspore/dataset/engine/cache_admin.py +3 -3
- mindspore/dataset/engine/cache_client.py +5 -4
- mindspore/dataset/engine/datasets.py +192 -149
- mindspore/dataset/engine/datasets_audio.py +14 -0
- mindspore/dataset/engine/datasets_standard_format.py +11 -11
- mindspore/dataset/engine/datasets_text.py +38 -1
- mindspore/dataset/engine/datasets_user_defined.py +100 -66
- mindspore/dataset/engine/datasets_vision.py +81 -8
- mindspore/dataset/engine/iterators.py +281 -63
- mindspore/dataset/engine/obs/util.py +8 -0
- mindspore/dataset/engine/queue.py +40 -0
- mindspore/dataset/engine/samplers.py +26 -2
- mindspore/dataset/engine/serializer_deserializer.py +1 -1
- mindspore/dataset/engine/validators.py +43 -11
- mindspore/dataset/transforms/py_transforms_util.py +17 -0
- mindspore/dataset/transforms/transforms.py +29 -12
- mindspore/dataset/vision/validators.py +1 -2
- mindspore/device_context/__init__.py +21 -0
- mindspore/device_context/ascend/__init__.py +25 -0
- mindspore/device_context/ascend/device.py +72 -0
- mindspore/device_context/ascend/op_debug.py +94 -0
- mindspore/device_context/ascend/op_precision.py +193 -0
- mindspore/device_context/ascend/op_tuning.py +127 -0
- mindspore/device_context/cpu/__init__.py +25 -0
- mindspore/device_context/cpu/device.py +62 -0
- mindspore/device_context/cpu/op_tuning.py +43 -0
- mindspore/device_context/gpu/__init__.py +21 -0
- mindspore/device_context/gpu/device.py +70 -0
- mindspore/device_context/gpu/op_precision.py +67 -0
- mindspore/device_context/gpu/op_tuning.py +175 -0
- mindspore/device_manager.py +134 -0
- mindspore/experimental/llm_boost/__init__.py +1 -0
- mindspore/experimental/llm_boost/ascend_native/__init__.py +22 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +211 -0
- mindspore/experimental/llm_boost/ascend_native/llm_boost.py +52 -0
- mindspore/experimental/llm_boost/atb/boost_base.py +2 -3
- mindspore/experimental/llm_boost/atb/llama_boost.py +6 -1
- mindspore/experimental/llm_boost/register.py +1 -0
- mindspore/experimental/optim/adadelta.py +26 -22
- mindspore/experimental/optim/adam.py +3 -0
- mindspore/experimental/optim/lr_scheduler.py +33 -24
- mindspore/experimental/optim/radam.py +33 -30
- mindspore/hal/device.py +28 -0
- mindspore/hal/event.py +17 -0
- mindspore/hal/memory.py +94 -3
- mindspore/hal/stream.py +91 -6
- mindspore/include/api/context.h +0 -1
- mindspore/lib/libavcodec.so.59 +0 -0
- mindspore/lib/libavdevice.so.59 +0 -0
- mindspore/lib/libavfilter.so.8 +0 -0
- mindspore/lib/libavformat.so.59 +0 -0
- mindspore/lib/libavutil.so.57 +0 -0
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_ops.so +0 -0
- mindspore/lib/libmpi_adapter.so +0 -0
- mindspore/lib/libmpi_collective.so +0 -0
- mindspore/lib/libnnacl.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/libps_cache.so +0 -0
- mindspore/lib/libswresample.so.4 +0 -0
- mindspore/lib/libswscale.so.6 +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_impl/ai_core/tbe/config/ascend910_93/aic-ascend910_93-ops-info.json +2048 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl/dynamic/decoder_kv_cache.py +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl/dynamic/prompt_kv_cache.py +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/version.info +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/config/ascend910_93/aic-ascend910_93-ops-info.json +224 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/all_finite.py +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.py +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.py +1 -1
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +78 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +78 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +78 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910_93/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910_93/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_proto/lib/linux/x86_64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/version.info +1 -1
- mindspore/lib/plugin/ascend/custom_compiler/setup.py +1 -1
- mindspore/lib/plugin/ascend/libascend_collective.so +0 -0
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/liblowlatency_collective.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_internal_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libms_ascend_native_boost.so +0 -0
- mindspore/lib/plugin/ascend/libms_atb_boost.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/device/ascend910b/bin/ascend910b.bin +960 -958
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{acme/include/base_type.h → base_type.h} +25 -20
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{cast/cast_tiling.h → internal.h} +6 -4
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_op.h +114 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/boost_kernel.h +70 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/llama_impl.h +85 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/model_interface.h +52 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/llm/tensor.h +81 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/op_creator.h +123 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/op_param.h +155 -110
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/{acme/include/tiling_info.h → tiling_info.h} +12 -9
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tiling_utils.h +178 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layer_norm_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_quant_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcast_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcompare_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libgelu_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libllama_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_optiling.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmulti_weight_matmul_kernel_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/librms_norm_op.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_f16_nz/internal_pp_matmul_f16_nz.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_f16_nz/internal_pp_matmul_f16_nz_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_i8_nz_compress/internal_pp_matmul_i8_nz_compress.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_i8_nz_compress/internal_pp_matmul_i8_nz_compress_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_int8_nz/internal_pp_matmul_int8_nz.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/object_kernels/internal_pp_matmul_int8_nz/internal_pp_matmul_int8_nz_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libadd_rms_norm_quant_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libapply_rotary_pos_emb_310p_impl.so → op_kernels/ascend310p/so_kernels/libapply_rotary_pos_emb_310p_ascend310p.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libcast_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libcompare_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libgelu_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libmatmul_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend310p/so_kernels/libreshape_and_cache_nz_ascend310p.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_4b60f88cdc28b25a36bad2d8b0a88092.json +163 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_4b60f88cdc28b25a36bad2d8b0a88092.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_cde61da2bd6fededcb1ba310a6ad16ee.json +163 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/hphol_kernels/add_rms_norm_dynamic_quant/AddRmsNormDynamicQuant_cde61da2bd6fededcb1ba310a6ad16ee.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/flash_attention_score/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_matmul_postfusion_mix/internal_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_matmul_postfusion_mix/internal_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_matmul_postfusion_mix/internal_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_multi_weight_matmul_postfusion_mix/internal_multi_weight_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_multi_weight_matmul_postfusion_mix/internal_multi_weight_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/internal_multi_weight_matmul_postfusion_mix/internal_multi_weight_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/paged_attention_v2/paged_attention_v2.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/paged_attention_v2/paged_attention_v2_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/object_kernels/paged_attention_v2/paged_attention_v2_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/so_kernels/libadd_layer_norm_ascend910b.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libadd_rms_norm_impl.so → op_kernels/ascend910b/so_kernels/libadd_rms_norm_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/so_kernels/libadd_rms_norm_quant_ascend910b.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libapply_rotary_pos_emb_impl.so → op_kernels/ascend910b/so_kernels/libapply_rotary_pos_emb_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libcast_impl.so → op_kernels/ascend910b/so_kernels/libcast_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libnot_equal_impl.so → op_kernels/ascend910b/so_kernels/libcompare_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libgelu_impl.so → op_kernels/ascend910b/so_kernels/libgelu_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/so_kernels/libllama_ascend910b.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libmatmul_impl.so → op_kernels/ascend910b/so_kernels/libmatmul_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libmulti_weight_matmul_kernel_impl.so → op_kernels/ascend910b/so_kernels/libmulti_weight_matmul_kernel_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/libreshape_and_cache_impl.so → op_kernels/ascend910b/so_kernels/libreshape_and_cache_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/{lib/librms_norm_impl.so → op_kernels/ascend910b/so_kernels/librms_norm_ascend910b.so} +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblccl_wrapper.so +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.10 +0 -0
- mindspore/lib/plugin/gpu/libcuda_ops.so.11 +0 -0
- mindspore/lib/plugin/gpu10.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu10.1/libnvidia_collective.so +0 -0
- mindspore/lib/plugin/gpu11.1/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.1/libnvidia_collective.so +0 -0
- mindspore/lib/plugin/gpu11.6/libnccl.so.2 +0 -0
- mindspore/lib/plugin/gpu11.6/libnvidia_collective.so +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.10.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.1 +0 -0
- mindspore/lib/plugin/libmindspore_gpu.so.11.6 +0 -0
- mindspore/log.py +12 -0
- mindspore/mindrecord/__init__.py +1 -1
- mindspore/mindrecord/config.py +17 -316
- mindspore/mindrecord/filereader.py +1 -9
- mindspore/mindrecord/filewriter.py +5 -15
- mindspore/mindrecord/mindpage.py +1 -9
- mindspore/mint/__init__.py +824 -218
- mindspore/mint/distributed/__init__.py +66 -4
- mindspore/mint/distributed/distributed.py +2594 -44
- mindspore/mint/linalg/__init__.py +6 -0
- mindspore/mint/nn/__init__.py +473 -14
- mindspore/mint/nn/functional.py +486 -11
- mindspore/mint/nn/layer/__init__.py +17 -4
- mindspore/mint/nn/layer/_functions.py +330 -0
- mindspore/mint/nn/layer/activation.py +169 -1
- mindspore/mint/nn/layer/basic.py +123 -0
- mindspore/mint/nn/layer/conv.py +727 -0
- mindspore/mint/nn/layer/normalization.py +215 -19
- mindspore/mint/nn/layer/padding.py +797 -0
- mindspore/mint/nn/layer/pooling.py +170 -0
- mindspore/mint/optim/__init__.py +2 -1
- mindspore/mint/optim/adam.py +223 -0
- mindspore/mint/optim/adamw.py +26 -19
- mindspore/mint/special/__init__.py +2 -1
- mindspore/multiprocessing/__init__.py +5 -0
- mindspore/nn/cell.py +126 -19
- mindspore/nn/dynamic_lr.py +2 -1
- mindspore/nn/layer/activation.py +6 -6
- mindspore/nn/layer/basic.py +35 -25
- mindspore/nn/layer/channel_shuffle.py +3 -3
- mindspore/nn/layer/embedding.py +3 -3
- mindspore/nn/layer/normalization.py +8 -7
- mindspore/nn/layer/padding.py +4 -3
- mindspore/nn/layer/pooling.py +47 -13
- mindspore/nn/layer/rnn_cells.py +1 -1
- mindspore/nn/layer/rnns.py +2 -1
- mindspore/nn/layer/timedistributed.py +5 -5
- mindspore/nn/layer/transformer.py +48 -26
- mindspore/nn/learning_rate_schedule.py +5 -3
- mindspore/nn/loss/loss.py +31 -36
- mindspore/nn/optim/ada_grad.py +1 -0
- mindspore/nn/optim/adadelta.py +2 -2
- mindspore/nn/optim/adam.py +1 -1
- mindspore/nn/optim/lars.py +1 -4
- mindspore/nn/optim/optimizer.py +1 -1
- mindspore/nn/optim/rprop.py +2 -2
- mindspore/nn/optim/thor.py +2 -1
- mindspore/nn/utils/init.py +13 -11
- mindspore/nn/wrap/cell_wrapper.py +4 -6
- mindspore/nn/wrap/loss_scale.py +3 -4
- mindspore/numpy/array_creations.py +60 -62
- mindspore/numpy/array_ops.py +148 -143
- mindspore/numpy/logic_ops.py +41 -42
- mindspore/numpy/math_ops.py +361 -359
- mindspore/numpy/utils.py +16 -16
- mindspore/numpy/utils_const.py +4 -4
- mindspore/ops/__init__.py +2 -1
- mindspore/ops/_grad_experimental/grad_comm_ops.py +94 -13
- mindspore/ops/_grad_experimental/grad_debug_ops.py +6 -1
- mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
- mindspore/ops/_grad_experimental/grad_math_ops.py +2 -1
- mindspore/ops/_op_impl/cpu/__init__.py +1 -0
- mindspore/ops/_op_impl/cpu/raise_op.py +28 -0
- mindspore/ops/_vmap/vmap_array_ops.py +20 -19
- mindspore/ops/_vmap/vmap_base.py +0 -2
- mindspore/ops/_vmap/vmap_grad_nn_ops.py +19 -13
- mindspore/ops/_vmap/vmap_math_ops.py +11 -9
- mindspore/ops/_vmap/vmap_nn_ops.py +20 -34
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +149 -12
- mindspore/ops/auto_generate/gen_arg_handler.py +0 -61
- mindspore/ops/auto_generate/gen_extend_func.py +554 -60
- mindspore/ops/auto_generate/gen_ops_def.py +1621 -115
- mindspore/ops/auto_generate/gen_ops_prim.py +8024 -3409
- mindspore/ops/auto_generate/pyboost_inner_prim.py +183 -79
- mindspore/ops/composite/base.py +1 -1
- mindspore/ops/composite/multitype_ops/_compile_utils.py +229 -30
- mindspore/ops/composite/multitype_ops/pow_impl.py +0 -29
- mindspore/ops/function/__init__.py +12 -0
- mindspore/ops/function/array_func.py +561 -159
- mindspore/ops/function/clip_func.py +64 -0
- mindspore/ops/function/debug_func.py +28 -20
- mindspore/ops/function/image_func.py +1 -1
- mindspore/ops/function/linalg_func.py +5 -4
- mindspore/ops/function/math_func.py +1659 -290
- mindspore/ops/function/nn_func.py +988 -317
- mindspore/ops/function/parameter_func.py +3 -56
- mindspore/ops/function/random_func.py +243 -33
- mindspore/ops/function/sparse_unary_func.py +1 -1
- mindspore/ops/functional.py +18 -5
- mindspore/ops/functional_overload.py +897 -0
- mindspore/ops/operations/__init__.py +3 -2
- mindspore/ops/operations/_embedding_cache_ops.py +4 -4
- mindspore/ops/operations/_grad_ops.py +2 -34
- mindspore/ops/operations/_infer_ops.py +2 -1
- mindspore/ops/operations/_inner_ops.py +38 -8
- mindspore/ops/operations/array_ops.py +45 -303
- mindspore/ops/operations/comm_ops.py +19 -16
- mindspore/ops/operations/custom_ops.py +11 -55
- mindspore/ops/operations/debug_ops.py +42 -47
- mindspore/ops/operations/inner_ops.py +6 -4
- mindspore/ops/operations/linalg_ops.py +3 -2
- mindspore/ops/operations/manually_defined/ops_def.py +185 -104
- mindspore/ops/operations/math_ops.py +11 -216
- mindspore/ops/operations/nn_ops.py +146 -308
- mindspore/ops/primitive.py +23 -21
- mindspore/ops/tensor_method.py +1669 -0
- mindspore/ops_generate/aclnn_kernel_register_auto_cc_generator.py +110 -0
- mindspore/ops_generate/add_tensor_docs_generator.py +54 -0
- mindspore/ops_generate/arg_handler.py +0 -61
- mindspore/ops_generate/auto_grad_impl_cc_generator.py +135 -0
- mindspore/ops_generate/auto_grad_reg_cc_generator.py +93 -0
- mindspore/ops_generate/base_generator.py +11 -0
- mindspore/ops_generate/cpp_create_prim_instance_helper_generator.py +108 -0
- mindspore/ops_generate/functional_map_cpp_generator.py +491 -0
- mindspore/ops_generate/functional_overload_py_generator.py +110 -0
- mindspore/ops_generate/functions_cc_generator.py +233 -0
- mindspore/ops_generate/gen_aclnn_implement.py +110 -114
- mindspore/ops_generate/gen_constants.py +157 -3
- mindspore/ops_generate/gen_ops.py +245 -990
- mindspore/ops_generate/gen_pyboost_func.py +97 -998
- mindspore/ops_generate/gen_utils.py +119 -33
- mindspore/ops_generate/lite_ops_cpp_generator.py +155 -0
- mindspore/ops_generate/op_api_proto.py +206 -0
- mindspore/ops_generate/op_def_py_generator.py +131 -0
- mindspore/ops_generate/op_prim_py_generator.py +480 -0
- mindspore/ops_generate/op_proto.py +373 -108
- mindspore/ops_generate/op_template_parser.py +436 -0
- mindspore/ops_generate/ops_def_cc_generator.py +288 -0
- mindspore/ops_generate/ops_def_h_generator.py +74 -0
- mindspore/ops_generate/ops_name_h_generator.py +68 -0
- mindspore/ops_generate/ops_primitive_h_generator.py +81 -0
- mindspore/ops_generate/pyboost_functions_cpp_generator.py +370 -0
- mindspore/ops_generate/pyboost_functions_h_generator.py +68 -0
- mindspore/ops_generate/pyboost_functions_py_generator.py +148 -0
- mindspore/ops_generate/pyboost_grad_function_cpp_generator.py +154 -0
- mindspore/ops_generate/pyboost_inner_prim_generator.py +131 -0
- mindspore/ops_generate/pyboost_native_grad_functions_generator.py +268 -0
- mindspore/ops_generate/pyboost_op_cpp_code_generator.py +851 -0
- mindspore/ops_generate/pyboost_overload_functions_cpp_generator.py +344 -0
- mindspore/ops_generate/pyboost_utils.py +92 -33
- mindspore/ops_generate/template.py +294 -44
- mindspore/ops_generate/tensor_func_reg_cpp_generator.py +422 -0
- mindspore/parallel/__init__.py +3 -3
- mindspore/parallel/_auto_parallel_context.py +24 -33
- mindspore/parallel/_parallel_serialization.py +13 -2
- mindspore/parallel/_utils.py +4 -1
- mindspore/parallel/algo_parameter_config.py +1 -1
- mindspore/parallel/checkpoint_transform.py +44 -0
- mindspore/parallel/cluster/process_entity/_api.py +131 -37
- mindspore/parallel/cluster/process_entity/_utils.py +41 -6
- mindspore/parallel/cluster/run.py +20 -3
- mindspore/parallel/parameter_broadcast.py +1 -1
- mindspore/parallel/shard.py +3 -0
- mindspore/parallel/transform_safetensors.py +119 -253
- mindspore/profiler/__init__.py +17 -4
- mindspore/profiler/analysis/__init__.py +0 -0
- mindspore/profiler/analysis/parser/__init__.py +0 -0
- mindspore/profiler/analysis/parser/ascend_cann_parser.py +166 -0
- mindspore/profiler/analysis/parser/base_parser.py +158 -0
- mindspore/profiler/analysis/parser/framework_cann_relation_parser.py +45 -0
- mindspore/profiler/analysis/parser/ms_framework_parser.py +142 -0
- mindspore/profiler/analysis/parser/ms_minddata_parser.py +145 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/ascend_timeline_assembler.py +261 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/base_timeline_assembler.py +40 -0
- mindspore/profiler/analysis/parser/timeline_assembly_factory/trace_view_container.py +84 -0
- mindspore/profiler/analysis/parser/timeline_creator/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_creator/base_timeline_creator.py +44 -0
- mindspore/profiler/analysis/parser/timeline_creator/cpu_op_timeline_creator.py +90 -0
- mindspore/profiler/analysis/parser/timeline_creator/fwk_timeline_creator.py +76 -0
- mindspore/profiler/analysis/parser/timeline_creator/msprof_timeline_creator.py +103 -0
- mindspore/profiler/analysis/parser/timeline_creator/scope_layer_timeline_creator.py +134 -0
- mindspore/profiler/analysis/parser/timeline_event/__init__.py +0 -0
- mindspore/profiler/analysis/parser/timeline_event/base_event.py +233 -0
- mindspore/profiler/analysis/parser/timeline_event/cpu_op_event.py +47 -0
- mindspore/profiler/analysis/parser/timeline_event/flow_event.py +36 -0
- mindspore/profiler/analysis/parser/timeline_event/fwk_event.py +260 -0
- mindspore/profiler/analysis/parser/timeline_event/msprof_event.py +73 -0
- mindspore/profiler/analysis/parser/timeline_event/scope_layer_event.py +53 -0
- mindspore/profiler/analysis/parser/timeline_event/timeline_event_pool.py +146 -0
- mindspore/profiler/analysis/task_manager.py +131 -0
- mindspore/profiler/analysis/time_converter.py +84 -0
- mindspore/profiler/analysis/viewer/__init__.py +0 -0
- mindspore/profiler/analysis/viewer/ascend_communication_viewer.py +333 -0
- mindspore/profiler/analysis/viewer/ascend_integrate_viewer.py +87 -0
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +252 -0
- mindspore/profiler/analysis/viewer/ascend_memory_viewer.py +313 -0
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +322 -0
- mindspore/profiler/analysis/viewer/ascend_step_trace_time_viewer.py +265 -0
- mindspore/profiler/analysis/viewer/ascend_timeline_viewer.py +58 -0
- mindspore/profiler/analysis/viewer/base_viewer.py +26 -0
- mindspore/profiler/analysis/viewer/ms_dataset_viewer.py +97 -0
- mindspore/profiler/analysis/viewer/ms_minddata_viewer.py +581 -0
- mindspore/profiler/analysis/work_flow.py +73 -0
- mindspore/profiler/common/ascend_msprof_exporter.py +138 -0
- mindspore/profiler/common/command_executor.py +90 -0
- mindspore/profiler/common/constant.py +174 -3
- mindspore/profiler/common/file_manager.py +208 -0
- mindspore/profiler/common/log.py +130 -0
- mindspore/profiler/common/msprof_cmd_tool.py +202 -0
- mindspore/profiler/common/path_manager.py +371 -0
- mindspore/profiler/common/process_bar.py +168 -0
- mindspore/profiler/common/process_pool.py +9 -3
- mindspore/profiler/common/profiler_context.py +476 -0
- mindspore/profiler/common/profiler_info.py +304 -0
- mindspore/profiler/common/profiler_output_path.py +284 -0
- mindspore/profiler/common/profiler_parameters.py +210 -0
- mindspore/profiler/common/profiler_path_manager.py +120 -0
- mindspore/profiler/common/record_function.py +76 -0
- mindspore/profiler/common/tlv_decoder.py +76 -0
- mindspore/profiler/common/util.py +75 -2
- mindspore/profiler/dynamic_profiler.py +270 -37
- mindspore/profiler/envprofiler.py +138 -0
- mindspore/profiler/mstx.py +199 -0
- mindspore/profiler/platform/__init__.py +21 -0
- mindspore/profiler/platform/base_profiler.py +40 -0
- mindspore/profiler/platform/cpu_profiler.py +124 -0
- mindspore/profiler/platform/gpu_profiler.py +74 -0
- mindspore/profiler/platform/npu_profiler.py +309 -0
- mindspore/profiler/profiler.py +580 -93
- mindspore/profiler/profiler_action_controller.py +187 -0
- mindspore/profiler/profiler_interface.py +114 -0
- mindspore/profiler/schedule.py +208 -0
- mindspore/rewrite/api/symbol_tree.py +1 -2
- mindspore/run_check/_check_version.py +2 -6
- mindspore/runtime/__init__.py +37 -0
- mindspore/runtime/device.py +27 -0
- mindspore/runtime/event.py +209 -0
- mindspore/runtime/executor.py +148 -0
- mindspore/runtime/memory.py +392 -0
- mindspore/runtime/stream.py +460 -0
- mindspore/runtime/thread_bind_core.py +401 -0
- mindspore/train/__init__.py +2 -2
- mindspore/train/_utils.py +53 -18
- mindspore/train/amp.py +8 -4
- mindspore/train/callback/_checkpoint.py +32 -18
- mindspore/train/callback/_early_stop.py +1 -1
- mindspore/train/callback/_flops_collector.py +105 -69
- mindspore/train/callback/_history.py +1 -1
- mindspore/train/callback/_summary_collector.py +44 -6
- mindspore/train/callback/_tft_register.py +31 -10
- mindspore/train/dataset_helper.py +11 -11
- mindspore/train/metrics/precision.py +4 -5
- mindspore/train/mind_ir_pb2.py +167 -46
- mindspore/train/model.py +13 -15
- mindspore/train/serialization.py +462 -76
- mindspore/train/summary/summary_record.py +1 -2
- mindspore/train/train_thor/model_thor.py +1 -1
- mindspore/utils/__init__.py +4 -2
- mindspore/utils/bin/dataset-cache +0 -0
- mindspore/utils/bin/dataset-cache-server +0 -0
- mindspore/utils/dryrun.py +138 -0
- mindspore/utils/runtime_execution_order_check.py +550 -0
- mindspore/version.py +1 -1
- {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/METADATA +2 -3
- {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/RECORD +532 -466
- {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/entry_points.txt +1 -1
- mindspore/_data_dump.cpython-310-x86_64-linux-gnu.so +0 -0
- mindspore/bin/cache_admin +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/_tensor_overload.py +0 -139
- mindspore/lib/libmindspore_np_dtype.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme_op.h +0 -82
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/op_creator.h +0 -113
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/op_param.h +0 -193
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/dtype_registry.h +0 -90
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/kernel_register.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/platform/platform_configs.h +0 -89
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/core/platform/rt_funcs.h +0 -135
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_layer_norm_op.h +0 -60
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_rms_norm_op.h +0 -50
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/add_rms_norm_quant_op.h +0 -50
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/apply_rotary_pos_emb_nz_op.h +0 -42
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/apply_rotary_pos_emb_op.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_elewise_op.h +0 -34
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_only_ops.h +0 -94
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/asd_op_base.h +0 -97
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/cast_op.h +0 -52
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/flash_attention_score_op.h +0 -97
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/gelu_op.h +0 -44
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/matmul_add_rmsnorm_op.h +0 -73
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/matmul_op.h +0 -108
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/multi_impls_op.h +0 -64
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/multi_weight_matmul_op.h +0 -91
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/paged_attention_op.h +0 -99
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/reshape_and_cache_nz_op.h +0 -44
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/reshape_and_cache_op.h +0 -44
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/rms_norm_op.h +0 -64
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/asd_utils.h +0 -179
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/comm_utils.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/utils/profiling_util.h +0 -366
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/add_impl.h +0 -56
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/kernel/add.h +0 -21
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/add/tiling/add_tiling.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/apply_rotary_pos_emb_impl.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb.h +0 -23
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_base.h +0 -456
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_bf16.h +0 -217
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp.h +0 -391
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp16.h +0 -126
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_fp32.h +0 -230
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_tiling.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb/kernel/apply_rotary_pos_emb_value.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/apply_rotary_pos_emb_nz_impl.h +0 -34
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz.h +0 -23
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_base.h +0 -460
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_fp16.h +0 -116
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_fp32.h +0 -230
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_tiling.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/apply_rotary_pos_emb_nz/kernel/apply_rotary_pos_emb_nz_value.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/asdop/asd_op_impl.h +0 -74
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/backend_param.h +0 -74
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/cast/cast_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/cast/kernel/cast_kernel.h +0 -21
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/compare_impl.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/compare_tiling.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/compare/kernel/compare_kernel.h +0 -23
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/and_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/div_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/elewise_binary_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/elewise_binary_tiling.h +0 -25
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/and_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/div_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/elewise_binary_base.h +0 -260
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/elewise_binary_kernel.h +0 -35
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/max_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/min_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/mul_kernel.h +0 -66
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/kernel/or_kernel.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/max_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/min_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/mul_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_binary/or_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/abs_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/elewise_unary_impl.h +0 -47
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/elewise_unary_tiling.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/exp_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/abs_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/elewise_unary_base.h +0 -148
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/elewise_unary_kernel.h +0 -31
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/exp_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/ln_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/not_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/reciprocal_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/relu_kernel.h +0 -55
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/rsqrt_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/kernel/sqrt_kernel.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/ln_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/not_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/reciprocal_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/relu_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/rsqrt_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/elewise_unary/sqrt_impl.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/flash_attention_score/flash_attention_score_impl.h +0 -68
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_kernel.h +0 -99
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_rtbackend.h +0 -21
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/lccl/lccl_wrapper.h +0 -58
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/ms_int_types.h +0 -91
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/ms_int_utils.h +0 -108
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/paged_attention/paged_attention_impl.h +0 -64
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/add_param.h +0 -68
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/attention_param.h +0 -40
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/cast_param.h +0 -30
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/compare_param.h +0 -31
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/elewise_param.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/grouped_matmul_param.h +0 -40
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_ext_param.h +0 -38
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/matmul_qkv_param.h +0 -42
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/param/sub_param.h +0 -33
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/profiling_util.h +0 -377
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache_nz/kernel/reshape_and_cache_nz.h +0 -24
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache_nz/reshape_and_cache_nz_impl.h +0 -42
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/reshape_and_cache_nz/reshape_and_cache_nz_tiling.h +0 -27
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/rms_norm/rms_norm_impl.h +0 -46
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/kernel/sub_kernel.h +0 -20
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_impl.h +0 -48
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/sub/sub_tiling.h +0 -25
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tune_repo/matmul_table.h +0 -399
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/tune_repo/utils.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/backend.h +0 -45
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/elewise_tiling.h +0 -29
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/elewise_utils.h +0 -30
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_core.h +0 -43
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_entity.h +0 -38
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_sink.h +0 -69
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_stream.h +0 -41
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_tiling.h +0 -71
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/log/log_utils.h +0 -165
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/math.h +0 -20
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/register/kernel_creator.h +0 -39
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/register/kernel_registry.h +0 -121
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/utils/utils.h +0 -106
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libAdd_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libSub_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layer_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_quant_acme_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix.json +0 -19
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMatMulPostFusionMixTactic/acme_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix.json +0 -19
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix_mix_aic_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/AcmeMultiWeightMatMulPostFusionMixTactic/acme_multi_weight_matmul_postfusion_mix_mix_aiv_0.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bsh_mix.o +0 -0
- mindspore/profiler/envprofiling.py +0 -254
- mindspore/profiler/profiling.py +0 -1926
- {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/WHEEL +0 -0
- {mindspore-2.4.10.dist-info → mindspore-2.5.0.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,134 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""Device manager interfaces."""
+
+import os
+from mindspore import log as logger
+from mindspore._c_expression import DeviceManagerConf, DeviceContextManager, MSContext, CollectiveManager
+from mindspore._checkparam import args_type_check
+from mindspore.parallel._ps_context import _need_reset_device_target_for_ps
+
+__all__ = ['set_device', 'set_deterministic']
+
+
+@args_type_check(device_target=str, device_id=int)
+def set_device(device_target, device_id=None):
+    """
+    Set device target and device id for running environment.
+
+    Note:
+        - The `device_target` must be set in the ["CPU", "GPU", "Ascend"], there is no default value.
+        - Suggest setting `device_target` and `device_id` before calling :func:`mindspore.communication.init`.
+
+    Args:
+        device_target (str): The target device to run, only support "Ascend", "GPU", and "CPU".
+        device_id (int): ID of the target device, the value must be in [0, device_num_per_host-1].
+            The frame will set different default behaviours according to the scenario:
+            if it is a single-card scenario, the frame will be set to 0.
+            In a distributed scenario where msrun is started, the framework will
+            automatically negotiate the available device_id values.
+            In a distributed scenario with other startup methods, the frame is set to 0.
+            "device_num_per_host" refers to the total number of devices on the host.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> ms.set_device("Ascend", 1)
+    """
+    valid_targets = ["CPU", "GPU", "Ascend"]
+    if device_target not in valid_targets:
+        raise ValueError(f"The argument 'device_target' must be one of {valid_targets}, but got {device_target}.")
+    # If in Parameter Server mode, Ascend card should not be used by server and scheduler.
+    if _need_reset_device_target_for_ps(device_target):
+        logger.info("Reset device target to CPU when set_device.")
+        device_target = "CPU"
+
+    is_default = False
+    if device_id is None:
+        device_id = 0
+        is_default = True
+    if device_id < 0:
+        raise ValueError("The device id must bigger than or equal to 0.")
+
+    MSContext.get_instance().set_device_target_inner(device_target)
+
+    if DeviceManagerConf.get_instance().is_device_enable():
+        old_device_target = DeviceManagerConf.get_instance().get_device_target()
+        old_device_id = DeviceManagerConf.get_instance().get_device_id()
+        if old_device_target != device_target or old_device_id != device_id:
+            raise RuntimeError("The 'mindspore.set_device' can not be modified.")
+        return
+
+    device_context = DeviceContextManager.get_instance().get_device_context(device_target)
+    if device_context is not None and device_context.initialized():
+        raise RuntimeError("The runtime has been initialized, please set it before the kernel is executed, "
+                           "or before calling 'mindspore.communication.init()'. "
+                           "Suggest setting it as early as possible.")
+    DeviceManagerConf.get_instance().set_device(device_target, device_id, is_default)
+
+
+@args_type_check(deterministic=bool)
+def set_deterministic(deterministic):
+    """
+    Enables or disables deterministic computing.
+
+    When deterministic computing is enabled, the same output is generated if an operator is executed
+    for multiple times with the same hardware and input.This often slows down operator execution.
+    In distributed scenario, we suggest user to set deterministic mode before
+    calling :func:`mindspore.communication.init` to enable deterministic operation for
+    communication operators in the global communication group.
+
+    The framework not enabled deterministic computation by default.
+
+    Args:
+        deterministic (bool): Whether to enable deterministic computing.
+
+    Examples:
+        >>> import mindspore as ms
+        >>> ms.set_deterministic(True)
+    """
+    # Check the configuration environment whether valid.
+    if DeviceManagerConf.get_instance().is_deterministic_configured():
+        raise RuntimeError("The 'mindspore.set_deterministic' can not be set repeatedly.")
+
+    # Must wait for all async created groups to be initialized so that
+    # deterministic feature could be consistent between all processes.
+    CollectiveManager.get_instance().wait_all_comm_init()
+
+    # Check the hccl_deterministic and te_parallel_compiler.
+    hccl_deterministic = os.getenv("HCCL_DETERMINISTIC")
+    te_parallel_compiler = os.getenv("TE_PARALLEL_COMPILER")
+    if deterministic:
+        if hccl_deterministic and hccl_deterministic != "true":
+            logger.warning(f"Environment 'HCCL_DETERMINISTIC' should be 'true' when set deterministic='True', but "
+                           f"got '{hccl_deterministic}'. 'HCCL_DETERMINISTIC' will be set to 'true'.")
+        if te_parallel_compiler and te_parallel_compiler != "1":
+            logger.warning(f"Environment 'TE_PARALLEL_COMPILER' should be '1' when set deterministic='True', but "
+                           f"got '{te_parallel_compiler}'. 'TE_PARALLEL_COMPILER' will be set to '1'.")
+        os.environ["HCCL_DETERMINISTIC"] = "true"
+        os.environ["TE_PARALLEL_COMPILER"] = "1"
+    else:
+        if hccl_deterministic and hccl_deterministic != "false":
+            logger.warning(f"Environment 'HCCL_DETERMINISTIC' should not be set or be 'false' when set "
+                           f"deterministic='False', but got '{hccl_deterministic}'. 'HCCL_DETERMINISTIC' "
+                           f"will be unset.")
+            del os.environ["HCCL_DETERMINISTIC"]
+        if te_parallel_compiler and te_parallel_compiler != "0":
+            logger.warning(f"Environment 'TE_PARALLEL_COMPILER' should not be set or be '0' when set "
+                           f"deterministic='False', but got '{te_parallel_compiler}'. 'TE_PARALLEL_COMPILER' "
+                           f"will be unset.")
+            del os.environ["TE_PARALLEL_COMPILER"]
+
+    DeviceManagerConf.get_instance().set_deterministic(deterministic)

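The hunk above introduces the new top-level mindspore.set_device and mindspore.set_deterministic interfaces. A minimal usage sketch, based only on the docstrings in this hunk (the Ascend target, card 0 and the distributed launch are assumptions):

    import mindspore as ms
    from mindspore import communication

    # Pick the backend and card before any kernel executes or any group is created.
    ms.set_device("Ascend", 0)
    # Also pins HCCL_DETERMINISTIC / TE_PARALLEL_COMPILER so collective results are reproducible.
    ms.set_deterministic(True)
    # In a distributed job, initialize communication only after the two calls above.
    communication.init()
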
@@ -16,6 +16,7 @@
 from __future__ import absolute_import
 
 from mindspore.experimental.llm_boost.atb import LlamaBoost, QwenBoost
+from mindspore.experimental.llm_boost.ascend_native import *
 from mindspore.experimental.llm_boost.register import LlmBoostRegister
 
 __all__ = ["LlmBoostRegister"]

@@ -0,0 +1,22 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Provide llm boost for inference, such as LlamaBoost.
+"""
+from __future__ import absolute_import
+
+from mindspore.experimental.llm_boost.ascend_native.llama_boost_ascend_native import LlamaBoostAscendNative
+
+__all__ = ['LlamaBoostAscendNative']

@@ -0,0 +1,211 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""AscendNative Llama Boost APIs."""
+
+import os
+import numpy as np
+from mindspore.common import Tensor, dtype
+from mindspore.experimental.llm_boost.ascend_native.llm_boost import LLMBoost
+from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
+
+
+def RoundUp(val: int, align: int) -> int:
+    if align == 0:
+        return 0
+    return -(val // -align) * align
+
+
+def ConvertTensor(nd_mat: np.ndarray, transpose: bool = True, nd2nz: bool = True) -> np.ndarray:
+    """ Transforms tensor format from Nd to Nz """
+    if transpose:
+        nd_mat = np.transpose(nd_mat)
+    if not nd2nz:
+        return nd_mat
+    block_size = (16, 16)
+    r = RoundUp(nd_mat.shape[0], block_size[0])
+    c = RoundUp(nd_mat.shape[1], block_size[1])
+    r_pad = r - nd_mat.shape[0]
+    c_pad = c - nd_mat.shape[1]
+    nd_mat = np.pad(nd_mat, ((0, r_pad), (0, c_pad)))
+    nz_mat = np.transpose(np.reshape(
+        nd_mat, (r, c // block_size[1], block_size[1])), (1, 0, 2))
+    nz_mat = nz_mat.reshape(r, c)
+    return nz_mat
+
+
+@LlmBoostRegister.register(LlmBoostType.ASCEND_NATIVE, "Llama")
+class LlamaBoostAscendNative(LLMBoost):
+    r"""
+    Implements an Llama model in a single kernel.
+    it forwards the python functions to the C++ binded object
+    """
+    def _get_from_dict(self, dictionary, name):
+        """ internal function to get a specific tensor from the dictionary """
+        all_relevant_layers = [value for key, value in dictionary.items() if name in key]
+        if all_relevant_layers:
+            return all_relevant_layers[0].asnumpy()
+        return None
+
+    def _get_quant_triplet_from_dict(self, dictionary, name):
+        """ internal function to get a weight triple tensor from the dictionary """
+        weights = self._get_from_dict(dictionary, name + "._handler.weight")
+        scale = self._get_from_dict(dictionary, name + "._weight_quantizer.scale")
+        offset = self._get_from_dict(dictionary, name + "._weight_quantizer.zp_neg")
+        return weights, scale, offset
+
+    def _prepare_single_layer(self, ckpt, config, id):
+        """ prepares the dictionary of weights of a single layer """
+        prefix = 'model.layers.' + str(id)
+        is_last = (id == config.num_layers-1)
+        layer = 'layers.' + str(id) + '.'
+        l_dict = {key: value for key, value in ckpt.items() if layer in key}
+        if config.n_kv_heads is None:
+            config.n_kv_heads = config.num_heads
+        start = 0
+        end = config.hidden_size
+        kv_start = 0
+        kv_end = int(config.hidden_size*config.n_kv_heads/config.num_heads)
+        ffn_hid = [value for key, value in l_dict.items() if "w3" in key][0].shape[0]
+        ffn_start = 0
+        ffn_end = ffn_hid
+        rank_size = int(os.getenv('RANK_SIZE', '1'))
+        #Emir if (config.parallel_mode != 2): # 2 - AUTO_PARALLEL
+        hid_size = end
+        kv_hid_size = kv_end
+        embed_size = config.vocab_size
+        rank_id = int(os.getenv('RANK_ID', '0'))
+        if (hid_size % rank_size == 0) and (ffn_hid % rank_size == 0) and (embed_size % rank_size == 0):
+            start = int(rank_id * hid_size / rank_size)
+            end = int((rank_id + 1) * hid_size / rank_size)
+            kv_start = int(rank_id * kv_hid_size / rank_size)
+            kv_end = int((rank_id + 1) * kv_hid_size / rank_size)
+            ffn_start = int(rank_id * ffn_hid / rank_size)
+            ffn_end = int((rank_id + 1) * ffn_hid / rank_size)
+        else:
+            raise RuntimeError("hidden size and ffn hidden size must be divided by rank size without remainder. \
+                hidden_size: ", hid_size, " ffn_hidden_size: ", ffn_hid, " rank_size: ", rank_size)
+        quant = (self._get_from_dict(l_dict, "_weight_quantizer") is not None)
+        unite_qkv = (config.num_heads == config.n_kv_heads)
+        self.dictionary[prefix + ".attention_norm.weight"] = \
+            Tensor(self._get_from_dict(l_dict, "attention_norm"), dtype=dtype.float16)
+        self.dictionary[prefix + ".ffn_norm.weight"] = \
+            Tensor(self._get_from_dict(l_dict, "ffn_norm"), dtype=dtype.float16)
+        if is_last:
+            self.dictionary['lm_head.weight'] = Tensor(ConvertTensor(ckpt['lm_head.weight'].asnumpy()[:, start:end]))
+
+        if not quant:
+            self._pack_attn_weights(l_dict, prefix, start, end, kv_start, kv_end, unite_qkv)
+            self._pack_ffn_weights(l_dict, prefix, ffn_start, ffn_end)
+        else:
+            self._pack_attn_quant_weights(l_dict, prefix, start, end, kv_start, kv_end, unite_qkv)
+            self._pack_ffn_quant_weights(l_dict, prefix, ffn_start, ffn_end)
+
+    def _pack_attn_weights(self, l_dict, prefix, start, end, kv_start, kv_end, unite_qkv):
+        """ prepares the dictionary of weights of an attention block """
+        wq = self._get_from_dict(l_dict, "wq")[start:end, :]
+        wk = self._get_from_dict(l_dict, "wk")[kv_start:kv_end, :]
+        wv = self._get_from_dict(l_dict, "wv")[kv_start:kv_end, :]
+        self.dictionary[prefix + ".attention.wo.weight"] = \
+            Tensor(ConvertTensor(self._get_from_dict(l_dict, "wo")[:, start:end]))
+        if unite_qkv:
+            self.dictionary[prefix + ".attention.wqkv.weight"] = Tensor(ConvertTensor(np.concatenate((wq, wk, wv))))
+        else:
+            self.dictionary[prefix + ".attention.wq.weight"] = Tensor(ConvertTensor(wq))
+            self.dictionary[prefix + ".attention.wkv.weight"] = Tensor(ConvertTensor(np.concatenate((wk, wv))))
+
+    def _pack_ffn_weights(self, l_dict, prefix, ffn_start, ffn_end):
+        """ prepares the dictionary of weights of an ffn block """
+        self.dictionary[prefix + ".feed_forward.w2.weight"] = \
+            Tensor(ConvertTensor(self._get_from_dict(l_dict, "w2")[:, ffn_start:ffn_end]))
+        w1 = self._get_from_dict(l_dict, "w1")[ffn_start:ffn_end, :]
+        w3 = self._get_from_dict(l_dict, "w3")[ffn_start:ffn_end, :]
+        self.dictionary[prefix + ".feed_forward.w13.weight"] = Tensor(ConvertTensor(np.concatenate((w1, w3))))
+
+    def _pack_attn_quant_weights(self, l_dict, prefix, start, end, kv_start, kv_end, unite_qkv):
+        """ prepares the dictionary of weights of a quantized attention block """
+        wq, wq_scale, wq_offset = self._get_quant_triplet_from_dict(l_dict, "wq")
+        wk, wk_scale, wk_offset = self._get_quant_triplet_from_dict(l_dict, "wk")
+        wv, wv_scale, wv_offset = self._get_quant_triplet_from_dict(l_dict, "wv")
+        wo, wo_scale, wo_offset = self._get_quant_triplet_from_dict(l_dict, "wo")
+        self.dictionary[prefix + ".attention.wo.weight"] = Tensor(ConvertTensor(wo[:, start:end], nd2nz=False))
+        self.dictionary[prefix + ".attention.wo.weight.scale"] = Tensor(wo_scale[start:end])
+        self.dictionary[prefix + ".attention.wo.weight.offset"] = Tensor(wo_offset[start:end])
+
+        if unite_qkv:
+            self.dictionary[prefix + ".attention.wqkv.weight"] = \
+                Tensor(ConvertTensor(np.concatenate((wq[start:end, :], wk[kv_start:kv_end, :], wv[kv_start:kv_end, :])),
+                                     nd2nz=False))
+            self.dictionary[prefix + ".attention.wqkv.weight.scale"] = \
+                Tensor(np.concatenate((wq_scale[start:end], wk_scale[kv_start:kv_end], wv_scale[kv_start:kv_end])))
+            self.dictionary[prefix + ".attention.wqkv.weight.offset"] = \
+                Tensor(np.concatenate((wq_offset[start:end], wk_offset[kv_start:kv_end], wv_offset[kv_start:kv_end])))
+        else:
+            self.dictionary[prefix + ".attention.wq.weight"] = Tensor(ConvertTensor(wq[start:end, :], nd2nz=False))
+            self.dictionary[prefix + ".attention.wq.weight.scale"] = Tensor(wq_scale[start:end])
+            self.dictionary[prefix + ".attention.wq.weight.offset"] = Tensor(wq_offset[start:end])
+            self.dictionary[prefix + ".attention.wkv.weight"] = \
+                Tensor(ConvertTensor(np.concatenate((wk[kv_start:kv_end, :], wv[kv_start:kv_end, :])), nd2nz=False))
+            self.dictionary[prefix + ".attention.wkv.weight.scale"] = \
+                Tensor(np.concatenate((wk_scale[kv_start:kv_end], wv_scale[kv_start:kv_end])))
+            self.dictionary[prefix + ".attention.wkv.weight.offset"] = \
+                Tensor(np.concatenate((wk_offset[kv_start:kv_end], wv_offset[kv_start:kv_end])))
+
+    def _pack_ffn_quant_weights(self, l_dict, prefix, ffn_start, ffn_end):
+        """ prepares the dictionary of weights of a quantized ffn block """
+        w1, w1_scale, w1_offset = self._get_quant_triplet_from_dict(l_dict, "w1")
+        w2, w2_scale, w2_offset = self._get_quant_triplet_from_dict(l_dict, "w2")
+        w3, w3_scale, w3_offset = self._get_quant_triplet_from_dict(l_dict, "w3")
+        self.dictionary[prefix + ".feed_forward.w2.weight"] = Tensor(ConvertTensor(w2[:, ffn_start:ffn_end],
+                                                                                   nd2nz=False))
+        self.dictionary[prefix + ".feed_forward.w2.weight.scale"] = Tensor(w2_scale[ffn_start:ffn_end])
+        self.dictionary[prefix + ".feed_forward.w2.weight.offset"] = Tensor(w2_offset[ffn_start:ffn_end])
+
+        self.dictionary[prefix + ".feed_forward.w13.weight"] = \
+            Tensor(ConvertTensor(np.concatenate((w1[ffn_start:ffn_end, :], w3[ffn_start:ffn_end, :])), nd2nz=False))
+        self.dictionary[prefix + ".feed_forward.w13.weight.scale"] = \
+            Tensor(np.concatenate((w1_scale[ffn_start:ffn_end], w3_scale[ffn_start:ffn_end])))
+        self.dictionary[prefix + ".feed_forward.w13.weight.offset"] = \
+            Tensor(np.concatenate((w1_offset[ffn_start:ffn_end], w3_offset[ffn_start:ffn_end])))
+
+    def _prepare_cos_sin_arrays(self, config, theta=10000):
+        """ prepares the cosine and sine arrays """
+        head_dim = config.hidden_size // config.num_heads
+        max_position_embedding = \
+            config.max_position_embedding if config.max_position_embedding is not None else config.seq_length
+        freqs_base = np.arange(0, head_dim, 2)[: (head_dim // 2)].astype(np.float32)
+        freqs = 1.0 / (theta ** (freqs_base / head_dim))
+        t = np.arange(0, max_position_embedding, 1).astype(np.float32)
+        freqs = np.outer(t, freqs)
+        emb = np.concatenate((freqs, freqs), axis=-1)
+        freqs_cos = Tensor(np.cos(emb), dtype=dtype.float16)
+        sin = np.sin(emb)
+
+        sin[:, :int(emb.shape[1]/2)] = -sin[:, :int(emb.shape[1]/2)]
+        self.dictionary['model.cos.weight'] = freqs_cos
+        freqs_sin = Tensor(sin, dtype=dtype.float16)
+        self.dictionary['model.sin.weight'] = freqs_sin
+
+    def set_weights(self, ckpt_dict):
+        """ load the checkpoint """
+        self.dictionary = {}
+        self.dictionary['model.tok_embeddings.embedding_weight'] = \
+            Tensor(ckpt_dict['model.tok_embeddings.embedding_weight'].asnumpy())
+        self.dictionary['model.norm_out.weight'] = \
+            Tensor(ckpt_dict['model.norm_out.weight'].asnumpy(), dtype=dtype.float16)
+        self._prepare_cos_sin_arrays(self.config)
+        for layer_id in range(self.config.num_layers):
+            self._prepare_single_layer(ckpt_dict, self.config, layer_id)
+
+        self.binder.set_weights_map(self.dictionary)

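RoundUp and ConvertTensor above repack a 2-D weight from the plain ND layout into the 16x16-blocked NZ layout consumed by the AscendNative kernels. A self-contained NumPy sketch of the same padding and block interleaving (the 20x35 toy shape is an arbitrary assumption):

    import numpy as np

    def round_up(val, align):
        # same ceiling-to-a-multiple trick as RoundUp above
        return 0 if align == 0 else -(val // -align) * align

    nd = np.arange(20 * 35, dtype=np.float16).reshape(20, 35)
    nd = nd.T                                    # transpose=True path: shape (35, 20)
    r = round_up(nd.shape[0], 16)                # 48 rows after padding
    c = round_up(nd.shape[1], 16)                # 32 columns after padding
    nd = np.pad(nd, ((0, r - nd.shape[0]), (0, c - nd.shape[1])))
    nz = np.transpose(nd.reshape(r, c // 16, 16), (1, 0, 2)).reshape(r, c)
    print(nz.shape)                              # (48, 32): zero-padded, column blocks interleaved
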
@@ -0,0 +1,52 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""LLMBoost APIs."""
+
+from mindspore.common import Tensor
+
+class LLMBoost():
+    r"""
+    Implements an LLM in a single kernel.
+    it forwards the python function to the C++ binded object
+    """
+    def __init__(self, config):
+        r"""
+        initialize the parameters of the llm binder.
+        config is simply the config object of the model
+        """
+        from mindspore._c_expression import LlmBoostBinder
+        self.config = config
+        self.binder = LlmBoostBinder("AscendNative", config.model_type)
+        self.binder.init_model(config.to_dict())
+
+    def init(self):
+        """
+        Initialize the object
+        returns True if object needs input manipulation by mindformers
+        """
+        return False
+
+    def set_kvcache(self, k_caches=None, v_caches=None):
+        return
+
+    def forward(self, input_ids, batch_valid_length, position_ids=None):
+        ret = self.binder.forward([input_ids, batch_valid_length], "nothing really")
+        return Tensor(ret[0])
+
+    def set_weights(self, ckpt_dict):
+        self.binder.set_weights_map(ckpt_dict)
+
+    def add_flags(self, is_first_iteration=False):
+        self.binder.add_flags(is_first_iteration=is_first_iteration)

@@ -112,8 +112,7 @@ class AtbBoostBase:
 
     def _convert_qkv_concat_weight(self, param_dict):
         """convert qkv concat weight"""
-
-        for i in range(assume_num_layers):
+        for i in range(self.num_layers):
             # qkv weight concat
             wq_weight_name = f"model.layers.{i}.attention.wq.weight"
             wk_weight_name = f"model.layers.{i}.attention.wk.weight"

@@ -151,7 +150,7 @@ class AtbBoostBase:
         logger.info(f"transform: {qkv_concat_weight_name}")
         logger.info(f"transform: {gate_hidden_concat_weight_name}")
 
-        for i in range(
+        for i in range(self.num_layers):
             # qkv bias concat
             wq_bias_name = f"model.layers.{i}.attention.wq.bias"
             wk_bias_name = f"model.layers.{i}.attention.wk.bias"

@@ -43,7 +43,11 @@ class LlamaBoost(AtbBoostBase):
         )
 
     def init(self):
-        """
+        """
+        Initialize the object
+        returns True if object needs input manipulation by mindformers
+        """
+
         coder_param = {
             "normEps": self.config.rms_norm_eps,
             "normType": NormType.RMS_NORM,

@@ -93,6 +97,7 @@ class LlamaBoost(AtbBoostBase):
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
+        return True
 
     def _prepare_inputs(
         self,

@@ -37,28 +37,32 @@ class Adadelta(Optimizer):
     Implements Adadelta algorithm.
 
     .. math::
-
-
-
-
-
-        &\textbf{
-        \:
-
-        &\textbf{
-
-
-
-
-        &\
-
-        &\
-
-        &\
-        &\
-        &\
-        &\
-
+        \newcommand{\grad}[2]{\nabla_{#1} f_{#2}(#2_{#2 - 1})}
+        \newcommand{\updateVar}[3]{#1_{#2} \leftarrow #1_{#2 - 1} \rho + #3_{#2} (1 - \rho)}
+
+        \begin{align*}
+        &\rule{150mm}{0.4pt} \\
+        &\textbf{Input}:
+            \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)},
+            \: \rho \text{ (decay)}, \: \lambda \text{ (weight decay)} \\
+        &\textbf{Initialize}:
+            \begin{cases}
+                v_0 \leftarrow 0 \text{ (square avg)} \\
+                u_0 \leftarrow 0 \text{ (accumulate variables)}
+            \end{cases} \\
+        &\rule{110mm}{0.4pt} \\
+        &\textbf{For } t = 1 \text{ to } \ldots \text{ do}: \\
+        &\quad g_t \leftarrow \grad{\theta}{t} \\
+        &\quad \text{If } \lambda \neq 0: \\
+        &\quad\quad g_t \leftarrow g_t + \lambda \theta_{t - 1} \\
+        &\quad v_t \leftarrow \updateVar{v}{t}{g^2} \\
+        &\quad \Delta x_t \leftarrow \frac{\sqrt{u_{t - 1} + \epsilon}}{\sqrt{v_t + \epsilon}} g_t \\
+        &\quad u_t \leftarrow \updateVar{u}{t}{\Delta x^2} \\
+        &\quad \theta_t \leftarrow \theta_{t - 1} - \gamma \Delta x_t \\
+        &\rule{110mm}{0.4pt} \\
+        &\bf{Return}: \theta_t \\
+        &\rule{110mm}{0.4pt}
+        \end{align*}
 
     .. warning::
         This is an experimental optimizer API that is subject to change.

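For reference, a minimal NumPy sketch of one update step as written in the reworked math block above (an illustration of the formulas only, not the mindspore.experimental.optim.Adadelta implementation):

    import numpy as np

    def adadelta_step(theta, grad, v, u, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0.0):
        if weight_decay != 0.0:
            grad = grad + weight_decay * theta               # g_t <- g_t + lambda * theta_{t-1}
        v = rho * v + (1.0 - rho) * grad ** 2                # v_t: running average of g_t^2
        delta = np.sqrt(u + eps) / np.sqrt(v + eps) * grad   # Delta x_t
        u = rho * u + (1.0 - rho) * delta ** 2               # u_t: running average of Delta x_t^2
        return theta - lr * delta, v, u                      # theta_t, updated state
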
@@ -78,6 +78,9 @@ class Adam(Optimizer):
         \end{aligned}
 
     .. warning::
+        The implementation formula of this optimizer interface is not completely consistent with that in the paper.
+        If you want to use an interface that is completely consistent, it is recommended to use
+        :class:`mindspore.mint.optim.Adam`, which currently only supports Ascend.
         This is an experimental optimizer API that is subject to change.
         This module must be used with lr scheduler module in `LRScheduler Class
         <https://www.mindspore.cn/docs/en/master/api_python/mindspore.nn.html#learningrateschedule-class>`_ .
