mindspore-2.4.0-cp39-none-any.whl → mindspore-2.4.10-cp39-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_c_dataengine.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/_c_expression.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/_c_mindrecord.cpython-39-aarch64-linux-gnu.so +0 -0
- mindspore/bin/cache_server +0 -0
- mindspore/common/api.py +1 -4
- mindspore/common/file_system.py +2 -0
- mindspore/common/initializer.py +51 -15
- mindspore/common/parameter.py +6 -5
- mindspore/common/tensor.py +15 -49
- mindspore/communication/_comm_helper.py +5 -0
- mindspore/communication/comm_func.py +7 -7
- mindspore/context.py +16 -2
- mindspore/dataset/engine/datasets_standard_format.py +17 -0
- mindspore/dataset/engine/datasets_user_defined.py +27 -1
- mindspore/experimental/llm_boost/__init__.py +2 -2
- mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/include/api/context.h +1 -1
- mindspore/include/dataset/constants.h +2 -2
- mindspore/include/mindapi/base/format.h +13 -0
- mindspore/lib/libavcodec.so.59 +0 -0
- mindspore/lib/libavdevice.so.59 +0 -0
- mindspore/lib/libavfilter.so.8 +0 -0
- mindspore/lib/libavformat.so.59 +0 -0
- mindspore/lib/libavutil.so.57 +0 -0
- mindspore/lib/libdnnl.so.2 +0 -0
- mindspore/lib/libmindspore_backend.so +0 -0
- mindspore/lib/libmindspore_common.so +0 -0
- mindspore/lib/libmindspore_core.so +0 -0
- mindspore/lib/libmindspore_gpr.so.15 +0 -0
- mindspore/lib/libmindspore_grpc++.so.1 +0 -0
- mindspore/lib/libmindspore_grpc.so.15 +0 -0
- mindspore/lib/libmindspore_ops.so +0 -0
- mindspore/lib/libopencv_core.so.4.5 +0 -0
- mindspore/lib/libopencv_imgcodecs.so.4.5 +0 -0
- mindspore/lib/libopencv_imgproc.so.4.5 +0 -0
- mindspore/lib/libswresample.so.4 +0 -0
- mindspore/lib/libswscale.so.6 +0 -0
- mindspore/lib/plugin/ascend/custom_aicore_ops/op_proto/libop_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_impl/cpu/aicpu_kernel/impl/libcust_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/custom_aicpu_ops/op_proto/libcust_op_proto.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/framework/npu_supported_ops.json +10 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_impl/ai_core/tbe/config/ascend910/aic-ascend910-ops-info.json +0 -42
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/decoder_kv_cache.py +51 -16
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/prompt_kv_cache.py +51 -16
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/ascend910/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/binary_info_config.json +302 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/kernel/config/ascend910/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/inc/op_proto.h +33 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910/version.info +1 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/framework/npu_supported_ops.json +14 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/include/aclnn_decoder_kv_cache.h +59 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_api/include/aclnn_prompt_kv_cache.h +59 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_api/lib/libcust_opapi.so +0 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl}/dynamic/all_finite.py +51 -16
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.cpp +192 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/decoder_kv_cache.py +215 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.cpp +274 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl/dynamic/prompt_kv_cache.py +215 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +80 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +158 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +167 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend310p/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_52f59e2a65d9b1bb002de35c2819754a.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_6b5e50e30256d85838d6ce83514df20f.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.json +78 -0
- mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.o → custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_74e4ac02880d452e3308c94af273562e.o} +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_0d5520cc587ad44ce634bf3fbcffc272.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_20390d30b3c4c0d23167ccca6c030c2b.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_2d151f0b1d2db51faa2968d5b67544e2.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_561690ec17cc1def3d2fcf68c1b07b56.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_570f9aaa99e5e773b3dd0a33784363f4.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_59668a0f0764afb98fda8ab9e84126f1.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_91d9833e4792b70b670e4e2b916abd86.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.json +156 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/decoder_kv_cache/DecoderKvCache_c74cdc5fef094383401856f8519504af.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0515c7b1a4cd614449e38c5e9a7e3f8d.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_09f22d898d6358c91e7c4fc48bac48e7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_0cb9a6f894b925250227136e5aab7061.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_2fa8702ffd7ca85e9e194f62644415d5.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_570b62f187dfd439b64613d881deedb7.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_585218c11411ff84709b9e725b66c435.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_5c9365ccde170b358c5b126d69dae13e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.json +165 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/ascend910b/prompt_kv_cache/PromptKvCache_6d97c45b7c43bc16fcff8baa5dacac4e.o +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend310p/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +139 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +361 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/decoder_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/kernel/config/ascend910b/prompt_kv_cache.json +892 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_910b/version.info +1 -0
- mindspore/lib/plugin/ascend/custom_compiler/setup.py +1 -1
- mindspore/lib/plugin/ascend/libdvpp_utils.so +0 -0
- mindspore/lib/plugin/ascend/libhccl_plugin.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_cpu_kernels.so +0 -0
- mindspore/lib/plugin/ascend/libmindspore_internal_kernels.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/PkgInspect +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/bin/op_man +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_cann_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/host/libasdops_host.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/include/asdops/utils/rt/base/types.h +5 -5
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/libasdops_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/liblcal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/asdops/lib/liblcal_static.a +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/include/acme_op.h +1 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/flash_attention_score_op.h +6 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/paged_attention_op.h +6 -1
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/acme/src/ops/host_src/rms_norm_op.h +4 -3
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libAdd_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libSub_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_layer_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libadd_rms_norm_quant_acme_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_310p_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libapply_rotary_pos_emb_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libcast_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libgelu_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmatmul_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libmulti_weight_matmul_kernel_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libnot_equal_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libreshape_and_cache_nz_old_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/librms_norm_impl.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_bf16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bnsd_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_full_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/flash_attention_score/flash_attention_score_fp16_bsh_tri_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_bf16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_bf16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp16.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/matmul_add_rmsnorm/matmul_add_rmsnorm_fp16_fp32.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_bf16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bnsd_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/internal_kernel/op_kernels/ascend910b/paged_attention/paged_attention_fp16_bsh_mix.o +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblcal.so +0 -0
- mindspore/lib/plugin/ascend/ms_kernels_internal/lccl/lib/liblccl_wrapper.so +0 -0
- mindspore/lib/plugin/libmindspore_ascend.so.2 +0 -0
- mindspore/mint/__init__.py +490 -2
- mindspore/mint/nn/__init__.py +2 -2
- mindspore/mint/optim/adamw.py +6 -14
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +16 -4
- mindspore/nn/layer/basic.py +24 -7
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/embedding.py +31 -14
- mindspore/nn/layer/pooling.py +8 -10
- mindspore/nn/optim/tft_wrapper.py +12 -15
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +71 -0
- mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
- mindspore/ops/_grad_experimental/grad_comm_ops.py +45 -8
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +6 -0
- mindspore/ops/auto_generate/gen_extend_func.py +33 -0
- mindspore/ops/auto_generate/gen_ops_def.py +52 -3
- mindspore/ops/auto_generate/gen_ops_prim.py +158 -8
- mindspore/ops/function/array_func.py +2 -0
- mindspore/ops/function/math_func.py +12 -5
- mindspore/ops/function/random_func.py +221 -7
- mindspore/ops/operations/__init__.py +1 -1
- mindspore/ops/operations/array_ops.py +3 -1
- mindspore/ops/operations/comm_ops.py +25 -1
- mindspore/ops/operations/custom_ops.py +6 -4
- mindspore/ops/operations/manually_defined/ops_def.py +8 -10
- mindspore/ops/operations/nn_ops.py +7 -2
- mindspore/parallel/_auto_parallel_context.py +26 -5
- mindspore/parallel/_cell_wrapper.py +24 -3
- mindspore/parallel/_tensor.py +46 -2
- mindspore/parallel/_utils.py +39 -21
- mindspore/parallel/transform_safetensors.py +196 -43
- mindspore/profiler/profiling.py +5 -1
- mindspore/run_check/_check_version.py +20 -9
- mindspore/train/_utils.py +92 -32
- mindspore/train/callback/_checkpoint.py +12 -9
- mindspore/train/callback/_on_request_exit.py +12 -1
- mindspore/train/callback/_tft_register.py +33 -9
- mindspore/train/dataset_helper.py +10 -2
- mindspore/train/model.py +21 -0
- mindspore/train/serialization.py +12 -19
- mindspore/version.py +1 -1
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/METADATA +9 -7
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/RECORD +297 -170
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_576ceaeef5870c451cab59af55ea46ad.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_86a73ff6e28d734c96bb8d3054f7dd18.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/ascend910b/all_finite/AllFinite_f55e0ebaad1f2f572e43677336992fa0.json +0 -58
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/all_finite.json +0 -109
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/kernel/config/ascend910b/binary_info_config.json +0 -38
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64/libcust_opmaster_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_impl/ai_core/tbe/op_tiling/liboptiling.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/op_proto/lib/linux/aarch64/libcust_opsproto_rt2.0.so +0 -0
- mindspore/lib/plugin/ascend/custom_ascendc_ops/version.info +0 -1
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_api/include/aclnn_decoder_kv_cache.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910}/op_api/include/aclnn_prompt_kv_cache.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/decoder_kv_cache.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910/op_impl/ai_core/tbe/custom_ascendc_910_impl}/dynamic/prompt_kv_cache.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_api/include/aclnn_all_finite.h +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_impl/ai_core/tbe/config/ascend310p/aic-ascend310p-ops-info.json +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_impl/ai_core/tbe/config/ascend910b/aic-ascend910b-ops-info.json +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops/op_impl/ai_core/tbe/custom_ascendc_ops_impl → custom_ascendc_910b/op_impl/ai_core/tbe/custom_ascendc_910b_impl}/dynamic/all_finite.cpp +0 -0
- /mindspore/lib/plugin/ascend/{custom_ascendc_ops → custom_ascendc_910b}/op_proto/inc/op_proto.h +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
- {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
mindspore/experimental/llm_boost/atb/boost_base.py

@@ -13,17 +13,32 @@
 # limitations under the License.
 # ============================================================================
 """boost base class"""
+from enum import Enum
 import numpy as np
 import mindspore as ms
 from mindspore import ops, Tensor
+from mindspore import log as logger
 from mindspore.ops import operations as P
 import mindspore.common.dtype as mstype
 from mindspore._c_expression import _set_format
-
 from mindspore.common.parameter import Parameter
 from mindspore.experimental.llm_boost.utils import get_real_rank, get_real_group_size
 from mindspore.common.initializer import Zero
 
+FORMAT_NZ = "FRACTAL_NZ"
+BUILDIN_BACKEND_NAME = "ATB"
+
+
+class PositionEmbeddingType(int, Enum):
+    ROPE = 0
+    ALIBI = 1
+    ABSOLUTE = 2
+
+
+class NormType(int, Enum):
+    RMS_NORM = 0
+    LAYER_NORM = 1
+
 
 class AttentionMask:
     """attention mask"""
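
Note: PositionEmbeddingType and NormType subclass both int and Enum, so their members compare equal to plain integers and pass through json.dumps unchanged, which is what lets the parameter dicts built later in this diff (in llama_boost.py and qwen_boost.py) embed them directly in the JSON handed to the ATB operations. A minimal standalone sketch of that behavior (plain Python, no MindSpore required):

    import json
    from enum import Enum

    class NormType(int, Enum):
        RMS_NORM = 0
        LAYER_NORM = 1

    # int-Enum members behave as ordinary ints, so json.dumps serializes
    # them as numbers instead of raising on an unknown type.
    print(json.dumps({"normType": NormType.RMS_NORM}))  # {"normType": 0}
    print(NormType.RMS_NORM == 0)                       # True
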
@@ -31,30 +46,34 @@ class AttentionMask:
     @classmethod
     def static(cls, max_seq_len, dtype=mstype.float16, need_nz=False):
         """cache mask"""
-        bias_cache = Tensor(np.tril(np.ones(
-            (max_seq_len, max_seq_len), dtype=np.bool_))).reshape(max_seq_len, max_seq_len)
+        bias_cache = Tensor(
+            np.tril(np.ones((max_seq_len, max_seq_len), dtype=np.bool_))
+        ).reshape(max_seq_len, max_seq_len)
         bias_cache = ~bias_cache
         if dtype == mstype.float16:
             mask_value = Tensor(np.finfo(np.float32).min, mstype.float16)
         else:
             mask_value = Tensor(1)
-        attn_mask = ops.masked_fill(Tensor(np.zeros(
-            (max_seq_len, max_seq_len)), dtype=mstype.float16), bias_cache, mask_value)
+        attn_mask = ops.masked_fill(
+            Tensor(np.zeros((max_seq_len, max_seq_len)), dtype=mstype.float16),
+            bias_cache,
+            mask_value,
+        )
         if need_nz:
             # ND -> NZ
             attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len))
-            attn_mask = ops.reshape(
-                attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
+            attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
             attn_mask = ops.transpose(attn_mask, (0, 2, 1, 3)).contiguous()
-            attn_mask = _set_format(attn_mask, "FRACTAL_NZ")
+            attn_mask = _set_format(attn_mask, FORMAT_NZ)
         return attn_mask
 
 
-class AtbBoostBase():
+class AtbBoostBase:
     """atb boost base class"""
 
     def __init__(self, config):
         super().__init__()
+        self.backend_name = BUILDIN_BACKEND_NAME
         self.is_first_iteration = False
         self.config = config
         self.dtype = config.compute_dtype
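
Note: with need_nz=True the rewritten AttentionMask.static still performs the same ND -> NZ conversion: the (max_seq_len, max_seq_len) mask is reshaped so the last axis is split into 16-element fractal blocks, then transposed before _set_format tags it as FRACTAL_NZ. A small numpy sketch of the shape arithmetic (numpy stands in for mindspore.ops here; the block width 16 is the value used in the diff):

    import numpy as np

    max_seq_len = 32  # must be a multiple of 16 for the // 16 split to be exact
    attn_mask = np.zeros((max_seq_len, max_seq_len), dtype=np.float16)

    # Same reshape/transpose sequence as the need_nz branch above:
    nz = attn_mask.reshape(1, max_seq_len, max_seq_len)
    nz = nz.reshape(1, max_seq_len, max_seq_len // 16, 16)
    nz = nz.transpose(0, 2, 1, 3)  # -> (1, S // 16, S, 16)
    print(nz.shape)                # (1, 2, 32, 16)
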
@@ -68,27 +87,98 @@ class AtbBoostBase():
         self.need_nz = config.need_nz
         self.placeholder = Tensor(np.zeros(1), dtype=self.dtype)
         self.lm_head_indices_fake = Tensor([0], dtype=mstype.int64)
-        self.position_embedding_type =
+        self.position_embedding_type = PositionEmbeddingType.ROPE
         self.add_norm_enable = True
         self.max_decode_length = self.config.max_decode_length
         self.max_base_len = 128
         self.attn_mask = AttentionMask.static(
-            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz)
+            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz
+        )
 
         self.cast = P.Cast()
         self.reshape = P.Reshape()
         self.kv_quant = None
         self.rank_id = get_real_rank()
         self.device_num = get_real_group_size()
+        self.ascend_weight = []
+        self.k_caches = []
+        self.v_caches = []
 
     def _convert_tensor_format_and_dtype(self, tensor, dtype=mstype.float16):
         tensor = self.cast(tensor, dtype=dtype)
         if self.need_nz:
-            tensor = _set_format(tensor, "FRACTAL_NZ")
+            tensor = _set_format(tensor, FORMAT_NZ)
         return tensor
 
+    def _convert_qkv_concat_weight(self, param_dict):
+        """convert qkv concat weight"""
+        assume_num_layers = 500
+        for i in range(assume_num_layers):
+            # qkv weight concat
+            wq_weight_name = f"model.layers.{i}.attention.wq.weight"
+            wk_weight_name = f"model.layers.{i}.attention.wk.weight"
+            wv_weight_name = f"model.layers.{i}.attention.wv.weight"
+            qkv_concat_weight_name = f"model.layers.{i}.attention.w_qkv.weight"
+            if wq_weight_name not in param_dict:
+                break
+            wq_weight = param_dict[wq_weight_name].asnumpy()
+            wk_weight = param_dict[wk_weight_name].asnumpy()
+            wv_weight = param_dict[wv_weight_name].asnumpy()
+            qkv_weight = np.concatenate((wq_weight, wk_weight, wv_weight), 0)
+            param_dict[qkv_concat_weight_name] = Parameter(
+                qkv_weight, name=qkv_concat_weight_name
+            )
+
+            # gate hidden weight concat
+            ffn_gate_weight_name = f"model.layers.{i}.feed_forward.w1.weight"
+            ffn_hidden_weight_name = f"model.layers.{i}.feed_forward.w3.weight"
+            gate_hidden_concat_weight_name = (
+                f"model.layers.{i}.feed_forward.w_gate_hidden.weight"
+            )
+
+            ffn_gate_weight = param_dict[ffn_gate_weight_name].asnumpy()
+            ffn_hidden_weight = param_dict[ffn_hidden_weight_name].asnumpy()
+            gate_hidden_weight = np.concatenate((ffn_gate_weight, ffn_hidden_weight), 0)
+            param_dict[gate_hidden_concat_weight_name] = Parameter(
+                gate_hidden_weight, name=gate_hidden_concat_weight_name
+            )
+
+            param_dict.pop(wq_weight_name)
+            param_dict.pop(wk_weight_name)
+            param_dict.pop(wv_weight_name)
+            param_dict.pop(ffn_gate_weight_name)
+            param_dict.pop(ffn_hidden_weight_name)
+            logger.info(f"transform: {qkv_concat_weight_name}")
+            logger.info(f"transform: {gate_hidden_concat_weight_name}")
+
+        for i in range(assume_num_layers):
+            # qkv bias concat
+            wq_bias_name = f"model.layers.{i}.attention.wq.bias"
+            wk_bias_name = f"model.layers.{i}.attention.wk.bias"
+            wv_bias_name = f"model.layers.{i}.attention.wv.bias"
+            qkv_concat_bias_name = f"model.layers.{i}.attention.w_qkv.bias"
+            if wq_bias_name not in param_dict:
+                break
+
+            wq_bias_weight = param_dict[wq_bias_name].asnumpy()
+            wk_bias_weight = param_dict[wk_bias_name].asnumpy()
+            wv_bias_weight = param_dict[wv_bias_name].asnumpy()
+            qkv_bias_weight = np.concatenate(
+                (wq_bias_weight, wk_bias_weight, wv_bias_weight), 0
+            )
+            param_dict[qkv_concat_bias_name] = Parameter(
+                qkv_bias_weight, name=qkv_concat_bias_name
+            )
+
+            param_dict.pop(wq_bias_name)
+            param_dict.pop(wk_bias_name)
+            param_dict.pop(wv_bias_name)
+            logger.info(f"transform: {qkv_concat_bias_name}")
+        return param_dict
+
     def set_weights(self, parm_dict, dtype=mstype.float16):
         """set weights for llm boost"""
+        self._convert_qkv_concat_weight(parm_dict)
         embedding_weight_name = "model.tok_embeddings.embedding_weight"
         attention_norm_name = "attention_norm"
         qkv_name = "attention.w_qkv"
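
Note: the new _convert_qkv_concat_weight helper fuses the checkpoint's separate wq/wk/wv (and w1/w3) tensors along axis 0, the output dimension of (out, in) weights, so a single matmul against the fused parameter yields the concatenation of the separate projections. A standalone numpy sketch of why that fusion is lossless (shapes are illustrative only, not taken from a real checkpoint):

    import numpy as np

    hidden = 8
    x = np.random.randn(4, hidden).astype(np.float32)          # fake activations
    wq, wk, wv = (np.random.randn(hidden, hidden).astype(np.float32)
                  for _ in range(3))                           # (out, in) weights

    # Fuse along axis 0, exactly as the helper's np.concatenate call does.
    w_qkv = np.concatenate((wq, wk, wv), 0)

    # One matmul with the fused weight equals the three projections side by side.
    fused = x @ w_qkv.T
    separate = np.concatenate((x @ wq.T, x @ wk.T, x @ wv.T), 1)
    assert np.allclose(fused, separate)
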
@@ -101,45 +191,88 @@ class AtbBoostBase():
         placeholder = Parameter(Tensor(np.zeros(1), dtype=dtype))
 
         ascend_weight = []
-        ascend_weight.append(
-            self.cast(parm_dict[embedding_weight_name], dtype))
+        ascend_weight.append(self.cast(parm_dict[embedding_weight_name], dtype))
         for i in range(self.num_layers):
-            ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{attention_norm_name}.weight"], dtype))
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{attention_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-                    parm_dict[f"model.layers.{i}.{qkv_name}.weight"],
-                    dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{qkv_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{qkv_name}.bias", placeholder),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 16)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-                    parm_dict[f"model.layers.{i}.{o_name}.weight"],
-                    dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{o_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{o_name}.bias", placeholder), dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 4)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{mlp_norm_name}.weight"], dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-                    parm_dict[f"model.layers.{i}.{mlp_gate_name}.weight"],
-                    dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_gate_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_gate_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 10)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-                    parm_dict[f"model.layers.{i}.{mlp_down_name}.weight"],
-                    dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_down_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_down_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 4)
 
         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(parm_dict[f"{norm_out_name}.weight"], dtype))
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{norm_out_name}.weight"], dtype
+            )
+        )
         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(parm_dict[f"{lm_head_name}.weight"], dtype))
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{lm_head_name}.weight"], dtype
+            )
+        )
+        self.ascend_weight = ascend_weight
         self.atb_encoder_operation.set_weights(ascend_weight)
         self.atb_decoder_operation.set_weights(ascend_weight)
 
@@ -147,20 +280,47 @@ class AtbBoostBase():
         """set kv_cache for llm boost"""
         if not k_caches or v_caches:
             if self.need_nz:
-                kv_shape = (self.config.num_blocks, self.num_kv_heads *
-                            self.head_dim // self.device_num // 16, self.config.block_size, 16)
-                k_caches = [_set_format(Parameter(Tensor(shape=kv_shape, dtype=self.dtype,
-                            init=Zero())), "FRACTAL_NZ") for _ in range(self.num_layers)]
-                v_caches = [_set_format(Parameter(Tensor(shape=kv_shape, dtype=self.dtype,
-                            init=Zero())), "FRACTAL_NZ") for _ in range(self.num_layers)]
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.num_kv_heads * self.head_dim // self.device_num // 16,
+                    self.config.block_size,
+                    16,
+                )
+                k_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
             else:
-                kv_shape = (self.config.num_blocks, self.config.block_size,
-                            self.num_kv_heads // self.device_num,
-                            self.head_dim)
-                k_caches = [Parameter(Tensor(shape=kv_shape, dtype=self.dtype,
-                            init=Zero())) for _ in range(self.num_layers)]
-                v_caches = [Parameter(Tensor(shape=kv_shape, dtype=self.dtype,
-                            init=Zero())) for _ in range(self.num_layers)]
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.config.block_size,
+                    self.num_kv_heads // self.device_num,
+                    self.head_dim,
+                )
+                k_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+        self.k_caches = k_caches
+        self.v_caches = v_caches
         self.atb_encoder_operation.set_kvcache(k_caches, v_caches)
         self.atb_decoder_operation.set_kvcache(k_caches, v_caches)
 
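
Note: in set_kvcache the NZ branch allocates per-layer caches of shape (num_blocks, kv_heads * head_dim / device_num / 16, block_size, 16), which is the plain ND shape (num_blocks, block_size, kv_heads / device_num, head_dim) with the per-rank feature axis split into 16-wide fractal blocks; the element count per layer is unchanged. A quick arithmetic check (the numbers are illustrative, not from any shipped config):

    from math import prod

    num_blocks, block_size = 128, 16
    num_kv_heads, head_dim, device_num = 8, 128, 2

    nd = (num_blocks, block_size, num_kv_heads // device_num, head_dim)
    nz = (num_blocks, num_kv_heads * head_dim // device_num // 16, block_size, 16)

    assert prod(nd) == prod(nz)  # both layouts store the same elements per layer
    print(nd, nz)                # (128, 16, 4, 128) (128, 32, 16, 16)
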
@@ -171,11 +331,9 @@ class AtbBoostBase():
     def _execute_operator(self, acl_inputs, acl_param):
         """execute operator."""
         if self.is_first_iteration:
-            acl_model_out = self.atb_encoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_encoder_operation.forward(acl_inputs, acl_param)
         else:
-            acl_model_out = self.atb_decoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_decoder_operation.forward(acl_inputs, acl_param)
         acl_hidden_state = acl_model_out[0]
         return acl_hidden_state
 
@@ -183,28 +341,46 @@ class AtbBoostBase():
         r"""
         LlmBoost forward.
         """
-        input_ids = boost_inputs["input_ids"]
-        position_ids = boost_inputs["position_ids"]
-        cos_embed = boost_inputs["cos_embed"]
-        sin_embed = boost_inputs["sin_embed"]
-        block_tables = boost_inputs["block_tables"]
-        slot_mapping = boost_inputs["slot_mapping"]
-        batch_valid_length = boost_inputs["batch_valid_length"]
-        lm_head_indices = boost_inputs["lm_head_indices"]
-        seqLen = boost_inputs["seq_lens"]
+        input_ids = boost_inputs.get("input_ids", None)
+        position_ids = boost_inputs.get("position_ids", None)
+        cos_embed = boost_inputs.get("cos_embed", None)
+        sin_embed = boost_inputs.get("sin_embed", None)
+        block_tables = boost_inputs.get("block_tables", None)
+        slot_mapping = boost_inputs.get("slot_mapping", None)
+        batch_valid_length = boost_inputs.get("batch_valid_length", None)
+        lm_head_indices = boost_inputs.get("lm_head_indices", None)
+        seqLen = boost_inputs.get("seq_lens", None)
+        input_ids = self.reshape(input_ids, (-1,))
         if self.is_first_iteration:
             attention_mask = self.attn_mask
         else:
-            position_ids = batch_valid_length - 1
+            if position_ids is None:
+                position_ids = batch_valid_length - 1
             attention_mask = self.placeholder
             lm_head_indices = self.lm_head_indices_fake
 
-        acl_inputs, acl_param = self._prepare_inputs(
-            prefill=self.is_first_iteration, input_ids=input_ids,
-            position_ids=position_ids, cos_embed=cos_embed, sin_embed=sin_embed,
-            attention_mask=attention_mask, block_tables=block_tables,
-            slots=slot_mapping, input_lengths=batch_valid_length,
-            lm_head_indices=lm_head_indices, seqLen=seqLen)
+        if input_ids is not None and input_ids.dtype != mstype.int64:
+            input_ids = self.cast(input_ids, mstype.int64)
+        if position_ids is not None and position_ids.dtype != mstype.int64:
+            position_ids = self.cast(position_ids, mstype.int64)
+        if batch_valid_length is not None and batch_valid_length.dtype != mstype.int32:
+            batch_valid_length = self.cast(batch_valid_length, mstype.int32)
+        if lm_head_indices is not None and lm_head_indices.dtype != mstype.int64:
+            lm_head_indices = self.cast(lm_head_indices, mstype.int64)
+
+        acl_inputs, acl_param = self._prepare_inputs(
+            prefill=self.is_first_iteration,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cos_embed=cos_embed,
+            sin_embed=sin_embed,
+            attention_mask=attention_mask,
+            block_tables=block_tables,
+            slots=slot_mapping,
+            input_lengths=batch_valid_length,
+            lm_head_indices=lm_head_indices,
+            seqLen=seqLen,
+        )
         ms.hal.synchronize()
         logits = self._execute_operator(acl_inputs, acl_param)
         logits = self.cast(logits, mstype.float32)
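
Note: forward() now receives boost_inputs as a dict and reads each entry with .get, normalizing integer dtypes once up front instead of casting inside each subclass's _prepare_inputs (those casts are removed from llama_boost.py and qwen_boost.py below). A hedged sketch of what a caller-side payload could look like; the key names are the ones forward() reads in this diff, while shapes and values are placeholders, not taken from MindSpore documentation:

    import numpy as np
    from mindspore import Tensor

    boost_inputs = {
        "input_ids": Tensor(np.zeros((1, 16), np.int32)),        # cast to int64 by forward
        "position_ids": Tensor(np.arange(16, dtype=np.int64)),
        "cos_embed": Tensor(np.zeros((16, 128), np.float16)),
        "sin_embed": Tensor(np.zeros((16, 128), np.float16)),
        "block_tables": Tensor(np.zeros((1, 4), np.int32)),
        "slot_mapping": Tensor(np.arange(16, dtype=np.int32)),
        "batch_valid_length": Tensor(np.array([16], np.int64)),  # cast to int32 by forward
        "lm_head_indices": Tensor(np.array([15], np.int64)),
        "seq_lens": [16],
    }
    # logits = boost.forward(boost_inputs)  # 'boost' is a hypothetical LlamaBoost/QwenBoost instance
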
mindspore/experimental/llm_boost/atb/llama_boost.py

@@ -15,10 +15,16 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import (
+    AtbBoostBase,
+    PositionEmbeddingType,
+    NormType,
+)
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
+CPP_LLAMA_MODEL_CLASS_NAME = "llama_LlamaDecoderModel"
+
 
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Llama")
 class LlamaBoost(AtbBoostBase):
@@ -30,14 +36,17 @@ class LlamaBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "llama_LlamaDecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "llama_LlamaDecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
         coder_param = {
-            "rmsNormEps": self.config.rms_norm_eps,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
@@ -46,32 +55,41 @@ class LlamaBoost(AtbBoostBase):
             "isFA": False,
             "isBF16": self.dtype == mstype.bfloat16,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0]
-                                for _ in range(self.num_layers)],
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "isEmbeddingParallel": False,
             "isLmHeadParallel": not self.config.parallel_config.vocab_emb_dp,
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": True,
-            "supportKvQuant": self.kv_quant is not None,
+            "enableSwiGLU": True,
+            "enablekvQuant": self.kv_quant is not None,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend":
+            "backend": self.config.communication_backend,
             "rankTableFile": "",
-            "positionEmbeddingType":
+            "positionEmbeddingType": PositionEmbeddingType.ROPE,
             "hiddenSize": self.config.hidden_size,
             "gemma": False,
-            "enableAddNorm":
-            "
+            "enableAddNorm": False,
+            "enableCompressHead": False,
+            "isUnpadInputs": True,
         }
         encoder_param = {
-            **coder_param,
-            "isPrefill": True,
-            "supportLcoc": True,
-            "supportSpeculate": False,
+            **coder_param,
+            "isPrefill": True,
+            "enableLcoc": True,
+            "enableSpeculate": False,
+            "skipWordEmbedding": False,
+            "enableSplitFuse": False,
         }
         decoder_param = {
-            **coder_param,
-            "isPrefill": False,
+            **coder_param,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
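
Note: LlamaBoost.init builds one shared coder_param dict and derives encoder (prefill) and decoder (decode) variants that differ only in a handful of flags before each is serialized with json.dumps into its ATB operation. A condensed sketch of the pattern, with the flag values copied from the hunk above and the shared dict abridged to two keys:

    import json

    coder_param = {"isFA": False, "enableSwiGLU": True}  # shared subset, abridged

    encoder_param = {**coder_param, "isPrefill": True, "enableLcoc": True,
                     "enableSpeculate": False, "skipWordEmbedding": False,
                     "enableSplitFuse": False}
    decoder_param = {**coder_param, "isPrefill": False, "enableLcoc": False,
                     "enableSpeculate": False}

    # Mirrors self.atb_encoder_operation.init(json.dumps({**encoder_param})).
    print(json.dumps({**encoder_param}))
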
@@ -92,14 +110,15 @@ class LlamaBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
         self.acl_decoder_operation_inputs[1] = self.placeholder
-        self.acl_decoder_operation_inputs[2] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_decoder_operation_inputs[2] = position_ids
         self.acl_decoder_operation_inputs[3] = cos_embed
         self.acl_decoder_operation_inputs[4] = sin_embed
         self.acl_decoder_operation_inputs[5] = attention_mask
@@ -108,8 +127,6 @@ class LlamaBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[8] = self.placeholder
         self.acl_decoder_operation_inputs[9] = self.placeholder
         self.acl_decoder_operation_inputs[10] = self.placeholder
-        self.acl_decoder_operation_inputs[11] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[12] = self.cast(
-            lm_head_indices, mstype.int64)
+        self.acl_decoder_operation_inputs[11] = input_lengths
+        self.acl_decoder_operation_inputs[12] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
mindspore/experimental/llm_boost/atb/qwen_boost.py

@@ -15,11 +15,14 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase, NormType
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
 
+CPP_QWEN_MODEL_CLASS_NAME = "qwen_QwenDecoderModel"
+
+
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Qwen")
 class QwenBoost(AtbBoostBase):
     """QwenBoost class"""
@@ -30,9 +33,11 @@ class QwenBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "qwen_QwenDecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "qwen_QwenDecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
@@ -42,24 +47,43 @@ class QwenBoost(AtbBoostBase):
             "withEmbedding": True,
             "isEmbeddingParallel": True,
             "isLmHeadParallel": True,
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": not self.need_nz,
-            "rmsNormEps": self.config.rms_norm_eps,
+            "enableSwiGLU": not self.need_nz,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
             "numKeyValueHeadsPerRank": self.n_kv_heads // self.device_num,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend":
+            "backend": self.config.communication_backend,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0]
-                                for _ in range(self.num_layers)],
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearHasBias": [[True, False, False, False]] * self.num_layers,
+            "enableKvQuant": self.kv_quant is not None,
+            "enableLora": False,
+            "isUnpadInputs": True,
+            "enableAddNorm": False,
+        }
+        encoder_param = {
+            **param_dict,
+            "isPrefill": True,
+            "enableLcoc": False,
+            "enableSplitFuse": False,
+        }
+        decoder_param = {
+            **param_dict,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
+            "enablePrefixCache": False,
         }
-        encoder_param = {**param_dict, "isPrefill": True, "supportLcoc": False}
-        decoder_param = {**param_dict, "isPrefill": False,
-                         "supportLcoc": False, "supportSpeculate": False}
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
 
@@ -79,13 +103,14 @@ class QwenBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
-        self.acl_decoder_operation_inputs[1] = self.cast(
-            position_ids, mstype.int64)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
+        self.acl_decoder_operation_inputs[1] = position_ids
         self.acl_decoder_operation_inputs[2] = cos_embed
         self.acl_decoder_operation_inputs[3] = sin_embed
         self.acl_decoder_operation_inputs[4] = attention_mask
@@ -93,9 +118,7 @@ class QwenBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[6] = slots
         self.acl_decoder_operation_inputs[7] = self.placeholder
         self.acl_decoder_operation_inputs[8] = self.placeholder
-        self.acl_decoder_operation_inputs[9] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[10] = self.cast(
-            lm_head_indices, mstype.int64)
-        self.acl_decoder_operation_inputs[11] = self.placeholder
+        self.acl_decoder_operation_inputs[9] = self.placeholder
+        self.acl_decoder_operation_inputs[10] = input_lengths
+        self.acl_decoder_operation_inputs[11] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param