mindspore 2.4.1-cp39-cp39-macosx_11_0_arm64.whl → 2.4.10-cp39-cp39-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_c_dataengine.cpython-39-darwin.so +0 -0
- mindspore/_c_expression.cpython-39-darwin.so +0 -0
- mindspore/_c_mindrecord.cpython-39-darwin.so +0 -0
- mindspore/common/api.py +1 -4
- mindspore/common/file_system.py +2 -0
- mindspore/common/parameter.py +1 -14
- mindspore/communication/_comm_helper.py +5 -0
- mindspore/context.py +7 -2
- mindspore/dataset/engine/datasets_standard_format.py +17 -0
- mindspore/dataset/engine/datasets_user_defined.py +27 -1
- mindspore/experimental/llm_boost/__init__.py +2 -2
- mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/include/api/context.h +1 -1
- mindspore/include/dataset/constants.h +2 -2
- mindspore/lib/libavcodec.59.dylib +0 -0
- mindspore/lib/libavdevice.59.dylib +0 -0
- mindspore/lib/libavfilter.8.dylib +0 -0
- mindspore/lib/libavformat.59.dylib +0 -0
- mindspore/lib/libavutil.57.dylib +0 -0
- mindspore/lib/libmindspore_backend.dylib +0 -0
- mindspore/lib/libmindspore_common.dylib +0 -0
- mindspore/lib/libmindspore_core.dylib +0 -0
- mindspore/lib/libmindspore_gpr.15.dylib +0 -0
- mindspore/lib/libmindspore_grpc++.1.dylib +0 -0
- mindspore/lib/libmindspore_grpc.15.dylib +0 -0
- mindspore/lib/libmindspore_ops.dylib +0 -0
- mindspore/lib/libswresample.4.dylib +0 -0
- mindspore/lib/libswscale.6.dylib +0 -0
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +16 -2
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/pooling.py +8 -10
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +71 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +25 -7
- mindspore/ops/auto_generate/gen_ops_prim.py +3 -2
- mindspore/ops/function/math_func.py +5 -4
- mindspore/ops/operations/comm_ops.py +4 -1
- mindspore/ops/operations/custom_ops.py +6 -4
- mindspore/ops/operations/nn_ops.py +7 -2
- mindspore/parallel/_auto_parallel_context.py +23 -4
- mindspore/parallel/_cell_wrapper.py +22 -3
- mindspore/parallel/_utils.py +0 -1
- mindspore/run_check/_check_version.py +17 -8
- mindspore/train/callback/_tft_register.py +7 -6
- mindspore/train/model.py +1 -0
- mindspore/train/serialization.py +4 -1
- mindspore/version.py +1 -1
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/METADATA +2 -2
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/RECORD +56 -54
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
mindspore/.commit_id
CHANGED

@@ -1 +1 @@
-__commit_id__ = ''[sha1]:
+__commit_id__ = ''[sha1]:8e2ae935,[branch]:(HEAD,origin/r2.4.1,r2.4.1)''
mindspore/_c_dataengine.cpython-39-darwin.so
CHANGED
Binary file

mindspore/_c_expression.cpython-39-darwin.so
CHANGED
Binary file

mindspore/_c_mindrecord.cpython-39-darwin.so
CHANGED
Binary file
mindspore/common/api.py
CHANGED

@@ -1703,7 +1703,6 @@ class _CellGraphExecutor:
         self._graph_executor = GraphExecutor_.get_instance()
         self._graph_executor.set_py_exe_path(sys.executable)
         self._graph_executor.set_kernel_build_server_dir(os.path.split(kernel_build_server.__file__)[0] + os.sep)
-        self._pid = os.getpid()

     def init_dataset(self, queue_name, dataset_size, batch_size, dataset_types, dataset_shapes,
                      input_indexs, phase='dataset', need_run=True):

@@ -1934,9 +1933,7 @@ class _CellGraphExecutor:

     def del_net_res(self, obj, net_id):
         """Clear the memory resource of a network."""
-
-        if self._pid == os.getpid():
-            self._graph_executor.del_net_res(obj, net_id)
+        self._graph_executor.del_net_res(obj, net_id)

     def _get_branch_control_input(self):
         if ('obf_ratio' not in self.obfuscate_config.keys()) or (
mindspore/common/file_system.py
CHANGED

@@ -22,6 +22,7 @@ class FileSystem:
         self.create_args = ("ab",)
         self.open = open
         self.open_args = ("rb",)
+        self.backend = "basic"


 def _register_basic_file_system(fs: FileSystem):

@@ -45,4 +46,5 @@ def _register_mindio_file_system(fs: FileSystem):
     fs.create_args = ()
     fs.open = mindio.open_file
     fs.open_args = ()
+    fs.backend = "mindio"
     return True
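Note: the new `backend` field is just a string tag ("basic" or "mindio") recording which checkpoint I/O backend registered itself. A minimal, hypothetical sketch of how a caller could use it (the `describe_backend` helper below is illustrative, not part of MindSpore):

    # Hypothetical helper: report which file backend a FileSystem instance ended up with.
    # "basic" means plain Python open(..., "rb"); "mindio" means mindio.open_file.
    def describe_backend(fs):
        return "checkpoint I/O backend: " + fs.backend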
mindspore/common/parameter.py
CHANGED

@@ -22,7 +22,6 @@ import os
 import sys
 import math
 import numbers
-from contextlib import contextmanager
 import numpy as np
 from mindspore import log as logger
 from mindspore.log import _LogActionOnce

@@ -55,16 +54,6 @@ PARAMETER_NAME_PREFIX_MAX_LEN = 1024
 _GLOBAL_PARAMETER_KEY = -1


-@contextmanager
-def no_init_parameters():
-    init_class = globals()["Parameter"]
-    setattr(init_class, "init_param", False)
-    try:
-        yield
-    finally:
-        setattr(init_class, "init_param", True)
-
-
 def _is_in_auto_parallel_mode():
     """Get parallel mode."""
     return auto_parallel_context().get_parallel_mode() in ["semi_auto_parallel", "auto_parallel"]

@@ -988,9 +977,7 @@ class Parameter(Tensor_):
         """
         if self.is_default_input_init and self.is_in_parallel != _is_in_auto_parallel_mode():
             raise RuntimeError("Must set or change parallel mode before any initializer Tensor created.")
-        if
-            return self
-        if self.init_mode is None:
+        if self.init_mode is None or not self.has_init:
             return self
         if self.inited_param is not None:
             return self.inited_param
mindspore/context.py
CHANGED

@@ -936,6 +936,7 @@ def set_auto_parallel_context(**kwargs):
             \ group_ckpt_save_file
             \ auto_pipeline
             \ dump_local_norm
+            \ dump_device_local_norm
         ===========================  ===========================

     Args:

@@ -1090,6 +1091,9 @@ def set_auto_parallel_context(**kwargs):
         dump_local_norm (bool): Whether to dump local_norm value, when the `parallel_mode` is set to
                          ``semi_auto_parallel`` or ``auto_parallel``.
                          Default: ``False`` .
+        dump_device_local_norm (bool): Whether to dump device_local_norm value, when the `parallel_mode` is set to
+                         ``semi_auto_parallel`` or ``auto_parallel``.
+                         Default: ``False`` .

     Raises:
         ValueError: If input key is not attribute in auto parallel context.

@@ -1165,8 +1169,9 @@ def reset_auto_parallel_context():
     - pipeline_stages: 1.
     - pipeline_result_broadcast: False.
     - fusion_threshold: 64.
-    - dump_local_norm: False.
     - auto_pipeline: False.
+    - dump_local_norm: False.
+    - dump_device_local_norm: False.

     Examples:
         >>> import mindspore as ms

@@ -1793,7 +1798,7 @@ def set_context(**kwargs):
          When both exist simultaneously, the global jit config will not overwrite the local network's jit config.

        - jit_level (str): Used to control the compilation optimization level. Default: ``""`` , The framework
-          automatically selects the execution method based on product,
+          automatically selects the execution method based on product, Atlas training product is O2, and all other
          products are O0. In addition, The option of the dynamic shape must be O0 or O1, O2 is not supported.
          The value range is as follows:

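Note: `dump_device_local_norm` mirrors the existing `dump_local_norm` switch. A minimal usage sketch, assuming a MindSpore 2.4.10 install with a semi-auto-parallel setup already configured (not a verified end-to-end script):

    import mindspore as ms

    # Both options default to False; dump_device_local_norm is the one added in 2.4.10.
    ms.set_auto_parallel_context(parallel_mode="semi_auto_parallel",
                                 dump_local_norm=True,
                                 dump_device_local_norm=True)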
mindspore/dataset/engine/datasets_standard_format.py
CHANGED

@@ -33,6 +33,7 @@ from .datasets import UnionBaseDataset, SourceDataset, MappableDataset, Shuffle,
 from .datasets_user_defined import GeneratorDataset
 from .obs.obs_mindrecord_dataset import MindRecordFromOBS
 from .validators import check_csvdataset, check_minddataset, check_tfrecorddataset, check_obsminddataset
+from ..core.validator_helpers import type_check
 from ...mindrecord.config import _get_enc_key, _get_dec_mode, _get_hash_mode, decrypt, verify_file_hash


@@ -301,6 +302,22 @@ class MindDataset(MappableDataset, UnionBaseDataset):
             else:
                 self.new_padded_sample[k] = v

+    def __deepcopy__(self, memodict):
+        if id(self) in memodict:
+            return memodict[id(self)]
+        return self.__safe_deepcopy__(memodict, exclude=("mindrecord_op"))
+
+    def __getitem__(self, index):
+        type_check(index, (int,), "index")
+        if index < 0:
+            raise ValueError("index cannot be negative, but got {0}.".format(index))
+        if not hasattr(self, "mindrecord_op"):
+            minddata_node = cde.MindDataNode(
+                self.dataset_files, self.columns_list, self.sampler, self.new_padded_sample,
+                self.num_padded, shuffle_to_shuffle_mode(self.shuffle_option))
+            self.mindrecord_op = minddata_node.Build()
+        return [t.as_array() for t in self.mindrecord_op[index]]
+

 class TFRecordDataset(SourceDataset, UnionBaseDataset):
     """
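Note: the `__getitem__` added to MindDataset enables direct random access without building a full pipeline; it lazily constructs a `MindDataNode` and returns one record as a list of numpy arrays. A minimal sketch, assuming a MindRecord file named `demo.mindrecord` exists (hypothetical path):

    import mindspore.dataset as ds

    dataset = ds.MindDataset(dataset_files=["demo.mindrecord"], shuffle=False)
    sample = dataset[0]                      # list of numpy arrays, one per column
    print([arr.shape for arr in sample])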
mindspore/dataset/engine/datasets_user_defined.py
CHANGED

@@ -19,6 +19,7 @@ After declaring the dataset object, you can further apply dataset operations
 (e.g. filter, skip, concat, map, batch) on it.
 """
 import builtins
+import copy
 import errno
 import itertools
 import math

@@ -50,6 +51,7 @@ from ..core.config import get_enable_shared_mem, get_prefetch_size, get_multipro
     get_enable_watchdog, get_debug_mode, get_seed, set_seed
 from ..core.datatypes import mstypelist_to_detypelist
 from ..core.py_util_helpers import ExceptionHandler
+from ..core.validator_helpers import type_check
 from ..transforms import transforms


@@ -427,6 +429,10 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):
             subprocess_file_descriptor = w.sentinel
             st = time.time()
             while _PythonMultiprocessing.is_process_alive(w.pid):
+                process = psutil.Process(w.pid)
+                if process.status() == psutil.STATUS_ZOMBIE:
+                    process.kill()
+                    break
                 time.sleep(0.01)  # sleep 10ms, waiting for the subprocess exit
                 if time.time() - st > check_interval:
                     logger.warning("Waiting for the subprocess worker [{}] to exit.".format(w.pid))

@@ -469,7 +475,7 @@ class SamplerFn(cde.PythonMultiprocessingRuntime):

             # let the quit event notify the worker process to exit
             w.join(timeout=5)
-            if w.
+            if _PythonMultiprocessing.is_process_alive(w.pid):
                 # if the worker process did not exit, it may hang, try to terminate it
                 w.terminate()
                 w.close()

@@ -907,6 +913,26 @@ class GeneratorDataset(MappableDataset, UnionBaseDataset):
             return memodict[id(self)]
         return self.__safe_deepcopy__(memodict, exclude=("source", "__transfer_dataset__"))

+    def __getitem__(self, index):
+        type_check(index, (int, np.number), "index")
+        if not hasattr(self.source, "__getitem__"):
+            raise RuntimeError("Dataset don't support randomized access.")
+        if not hasattr(self, "generator_op"):
+            dataset = copy.deepcopy(self)
+            self.prepared_source = _generator_fn_wrapper(_cpp_sampler_fn, self.source)
+            if self.schema is None:
+                dataset.generator_node = cde.GeneratorNode(self.prepared_source, self.column_names, self.column_types,
+                                                           self.source_len, self.sampler, 1, None)
+            else:
+                schema = self.schema
+                if isinstance(schema, Schema):
+                    schema = self.schema.cpp_schema
+                dataset.generator_node = cde.GeneratorNode(self.prepared_source, schema, self.source_len,
+                                                           self.sampler, 1, None)
+            self.generator_op = dataset.generator_node.Build()
+        sample_id = self.generator_op.GetMappedIndex(index)
+        return self.source[sample_id]
+
     def is_shuffled(self):
         if self.sampler:
             return self.sampler.is_shuffled()
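Note: GeneratorDataset gains the same kind of random access; the index is mapped through the sampler (`GetMappedIndex`) and the user source is read directly, so the source itself must support `__getitem__`. A minimal sketch with an illustrative random-accessible source (`ToySource` is not part of MindSpore):

    import numpy as np
    import mindspore.dataset as ds

    class ToySource:
        """Random-accessible source: __getitem__ is required for dataset[i] to work."""
        def __init__(self, n=10):
            self._data = [np.array([i], dtype=np.int32) for i in range(n)]
        def __getitem__(self, index):
            return (self._data[index],)
        def __len__(self):
            return len(self._data)

    dataset = ds.GeneratorDataset(ToySource(), column_names=["col"], shuffle=False)
    print(dataset[3])   # reads the source directly instead of iterating the pipeline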
mindspore/experimental/llm_boost/__init__.py
CHANGED

@@ -15,7 +15,7 @@
 """LlmBoost Register"""
 from __future__ import absolute_import

-from mindspore.experimental.llm_boost.atb import
+from mindspore.experimental.llm_boost.atb import LlamaBoost, QwenBoost
 from mindspore.experimental.llm_boost.register import LlmBoostRegister

-__all__ = [
+__all__ = ["LlmBoostRegister"]

mindspore/experimental/llm_boost/atb/boost_base.py
CHANGED

@@ -13,17 +13,32 @@
 # limitations under the License.
 # ============================================================================
 """boost base class"""
+from enum import Enum
 import numpy as np
 import mindspore as ms
 from mindspore import ops, Tensor
+from mindspore import log as logger
 from mindspore.ops import operations as P
 import mindspore.common.dtype as mstype
 from mindspore._c_expression import _set_format
-
 from mindspore.common.parameter import Parameter
 from mindspore.experimental.llm_boost.utils import get_real_rank, get_real_group_size
 from mindspore.common.initializer import Zero

+FORMAT_NZ = "FRACTAL_NZ"
+BUILDIN_BACKEND_NAME = "ATB"
+
+
+class PositionEmbeddingType(int, Enum):
+    ROPE = 0
+    ALIBI = 1
+    ABSOLUTE = 2
+
+
+class NormType(int, Enum):
+    RMS_NORM = 0
+    LAYER_NORM = 1
+

 class AttentionMask:
     """attention mask"""

@@ -31,30 +46,34 @@ class AttentionMask:
     @classmethod
     def static(cls, max_seq_len, dtype=mstype.float16, need_nz=False):
         """cache mask"""
-        bias_cache = Tensor(
-
+        bias_cache = Tensor(
+            np.tril(np.ones((max_seq_len, max_seq_len), dtype=np.bool_))
+        ).reshape(max_seq_len, max_seq_len)
         bias_cache = ~bias_cache
         if dtype == mstype.float16:
             mask_value = Tensor(np.finfo(np.float32).min, mstype.float16)
         else:
             mask_value = Tensor(1)
-        attn_mask = ops.masked_fill(
-            (max_seq_len, max_seq_len)), dtype=mstype.float16),
+        attn_mask = ops.masked_fill(
+            Tensor(np.zeros((max_seq_len, max_seq_len)), dtype=mstype.float16),
+            bias_cache,
+            mask_value,
+        )
         if need_nz:
             # ND -> NZ
             attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len))
-            attn_mask = ops.reshape(
-                attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
+            attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
             attn_mask = ops.transpose(attn_mask, (0, 2, 1, 3)).contiguous()
-            attn_mask = _set_format(attn_mask,
+            attn_mask = _set_format(attn_mask, FORMAT_NZ)
         return attn_mask


-class AtbBoostBase():
+class AtbBoostBase:
     """atb boost base class"""

     def __init__(self, config):
         super().__init__()
+        self.backend_name = BUILDIN_BACKEND_NAME
         self.is_first_iteration = False
         self.config = config
         self.dtype = config.compute_dtype

@@ -68,27 +87,98 @@ class AtbBoostBase():
         self.need_nz = config.need_nz
         self.placeholder = Tensor(np.zeros(1), dtype=self.dtype)
         self.lm_head_indices_fake = Tensor([0], dtype=mstype.int64)
-        self.position_embedding_type =
+        self.position_embedding_type = PositionEmbeddingType.ROPE
         self.add_norm_enable = True
         self.max_decode_length = self.config.max_decode_length
         self.max_base_len = 128
         self.attn_mask = AttentionMask.static(
-            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz
+            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz
+        )

         self.cast = P.Cast()
         self.reshape = P.Reshape()
         self.kv_quant = None
         self.rank_id = get_real_rank()
         self.device_num = get_real_group_size()
+        self.ascend_weight = []
+        self.k_caches = []
+        self.v_caches = []

     def _convert_tensor_format_and_dtype(self, tensor, dtype=mstype.float16):
         tensor = self.cast(tensor, dtype=dtype)
         if self.need_nz:
-            tensor = _set_format(tensor,
+            tensor = _set_format(tensor, FORMAT_NZ)
         return tensor

+    def _convert_qkv_concat_weight(self, param_dict):
+        """convert qkv concat weight"""
+        assume_num_layers = 500
+        for i in range(assume_num_layers):
+            # qkv weight concat
+            wq_weight_name = f"model.layers.{i}.attention.wq.weight"
+            wk_weight_name = f"model.layers.{i}.attention.wk.weight"
+            wv_weight_name = f"model.layers.{i}.attention.wv.weight"
+            qkv_concat_weight_name = f"model.layers.{i}.attention.w_qkv.weight"
+            if wq_weight_name not in param_dict:
+                break
+            wq_weight = param_dict[wq_weight_name].asnumpy()
+            wk_weight = param_dict[wk_weight_name].asnumpy()
+            wv_weight = param_dict[wv_weight_name].asnumpy()
+            qkv_weight = np.concatenate((wq_weight, wk_weight, wv_weight), 0)
+            param_dict[qkv_concat_weight_name] = Parameter(
+                qkv_weight, name=qkv_concat_weight_name
+            )
+
+            # gate hidden weight concat
+            ffn_gate_weight_name = f"model.layers.{i}.feed_forward.w1.weight"
+            ffn_hidden_weight_name = f"model.layers.{i}.feed_forward.w3.weight"
+            gate_hidden_concat_weight_name = (
+                f"model.layers.{i}.feed_forward.w_gate_hidden.weight"
+            )
+
+            ffn_gate_weight = param_dict[ffn_gate_weight_name].asnumpy()
+            ffn_hidden_weight = param_dict[ffn_hidden_weight_name].asnumpy()
+            gate_hidden_weight = np.concatenate((ffn_gate_weight, ffn_hidden_weight), 0)
+            param_dict[gate_hidden_concat_weight_name] = Parameter(
+                gate_hidden_weight, name=gate_hidden_concat_weight_name
+            )
+
+            param_dict.pop(wq_weight_name)
+            param_dict.pop(wk_weight_name)
+            param_dict.pop(wv_weight_name)
+            param_dict.pop(ffn_gate_weight_name)
+            param_dict.pop(ffn_hidden_weight_name)
+            logger.info(f"transform: {qkv_concat_weight_name}")
+            logger.info(f"transform: {gate_hidden_concat_weight_name}")
+
+        for i in range(assume_num_layers):
+            # qkv bias concat
+            wq_bias_name = f"model.layers.{i}.attention.wq.bias"
+            wk_bias_name = f"model.layers.{i}.attention.wk.bias"
+            wv_bias_name = f"model.layers.{i}.attention.wv.bias"
+            qkv_concat_bias_name = f"model.layers.{i}.attention.w_qkv.bias"
+            if wq_bias_name not in param_dict:
+                break
+
+            wq_bias_weight = param_dict[wq_bias_name].asnumpy()
+            wk_bias_weight = param_dict[wk_bias_name].asnumpy()
+            wv_bias_weight = param_dict[wv_bias_name].asnumpy()
+            qkv_bias_weight = np.concatenate(
+                (wq_bias_weight, wk_bias_weight, wv_bias_weight), 0
+            )
+            param_dict[qkv_concat_bias_name] = Parameter(
+                qkv_bias_weight, name=qkv_concat_bias_name
+            )
+
+            param_dict.pop(wq_bias_name)
+            param_dict.pop(wk_bias_name)
+            param_dict.pop(wv_bias_name)
+            logger.info(f"transform: {qkv_concat_bias_name}")
+        return param_dict
+
     def set_weights(self, parm_dict, dtype=mstype.float16):
         """set weights for llm boost"""
+        self._convert_qkv_concat_weight(parm_dict)
         embedding_weight_name = "model.tok_embeddings.embedding_weight"
         attention_norm_name = "attention_norm"
         qkv_name = "attention.w_qkv"

@@ -101,45 +191,88 @@
         placeholder = Parameter(Tensor(np.zeros(1), dtype=dtype))

         ascend_weight = []
-        ascend_weight.append(
-            self.cast(parm_dict[embedding_weight_name], dtype))
+        ascend_weight.append(self.cast(parm_dict[embedding_weight_name], dtype))
         for i in range(self.num_layers):
-            ascend_weight.append(
-
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{attention_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{qkv_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{qkv_name}.bias", placeholder),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 16)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{o_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{o_name}.bias", placeholder), dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 4)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_gate_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_gate_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 10)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_down_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_down_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 4)

         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{norm_out_name}.weight"], dtype
+            )
+        )
         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{lm_head_name}.weight"], dtype
+            )
+        )
+        self.ascend_weight = ascend_weight
         self.atb_encoder_operation.set_weights(ascend_weight)
         self.atb_decoder_operation.set_weights(ascend_weight)

@@ -147,20 +280,47 @@
         """set kv_cache for llm boost"""
         if not k_caches or v_caches:
             if self.need_nz:
-                kv_shape = (
-
-
-
-
-
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.num_kv_heads * self.head_dim // self.device_num // 16,
+                    self.config.block_size,
+                    16,
+                )
+                k_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
             else:
-                kv_shape = (
-
-
-
-
-
-
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.config.block_size,
+                    self.num_kv_heads // self.device_num,
+                    self.head_dim,
+                )
+                k_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+        self.k_caches = k_caches
+        self.v_caches = v_caches
         self.atb_encoder_operation.set_kvcache(k_caches, v_caches)
         self.atb_decoder_operation.set_kvcache(k_caches, v_caches)

@@ -171,11 +331,9 @@
     def _execute_operator(self, acl_inputs, acl_param):
         """execute operator."""
         if self.is_first_iteration:
-            acl_model_out = self.atb_encoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_encoder_operation.forward(acl_inputs, acl_param)
         else:
-            acl_model_out = self.atb_decoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_decoder_operation.forward(acl_inputs, acl_param)
         acl_hidden_state = acl_model_out[0]
         return acl_hidden_state

@@ -183,28 +341,46 @@
         r"""
         LlmBoost forward.
         """
-        input_ids = boost_inputs
-        position_ids = boost_inputs
-        cos_embed = boost_inputs
-        sin_embed = boost_inputs
-        block_tables = boost_inputs
-        slot_mapping = boost_inputs
-        batch_valid_length = boost_inputs
-        lm_head_indices = boost_inputs
-        seqLen = boost_inputs
+        input_ids = boost_inputs.get("input_ids", None)
+        position_ids = boost_inputs.get("position_ids", None)
+        cos_embed = boost_inputs.get("cos_embed", None)
+        sin_embed = boost_inputs.get("sin_embed", None)
+        block_tables = boost_inputs.get("block_tables", None)
+        slot_mapping = boost_inputs.get("slot_mapping", None)
+        batch_valid_length = boost_inputs.get("batch_valid_length", None)
+        lm_head_indices = boost_inputs.get("lm_head_indices", None)
+        seqLen = boost_inputs.get("seq_lens", None)
+        input_ids = self.reshape(input_ids, (-1,))
         if self.is_first_iteration:
             attention_mask = self.attn_mask
         else:
-            position_ids
+            if position_ids is None:
+                position_ids = batch_valid_length - 1
             attention_mask = self.placeholder
             lm_head_indices = self.lm_head_indices_fake

-
-
-
-
-
-
+        if input_ids is not None and input_ids.dtype != mstype.int64:
+            input_ids = self.cast(input_ids, mstype.int64)
+        if position_ids is not None and position_ids.dtype != mstype.int64:
+            position_ids = self.cast(position_ids, mstype.int64)
+        if batch_valid_length is not None and batch_valid_length.dtype != mstype.int32:
+            batch_valid_length = self.cast(batch_valid_length, mstype.int32)
+        if lm_head_indices is not None and lm_head_indices.dtype != mstype.int64:
+            lm_head_indices = self.cast(lm_head_indices, mstype.int64)
+
+        acl_inputs, acl_param = self._prepare_inputs(
+            prefill=self.is_first_iteration,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cos_embed=cos_embed,
+            sin_embed=sin_embed,
+            attention_mask=attention_mask,
+            block_tables=block_tables,
+            slots=slot_mapping,
+            input_lengths=batch_valid_length,
+            lm_head_indices=lm_head_indices,
+            seqLen=seqLen,
+        )
         ms.hal.synchronize()
         logits = self._execute_operator(acl_inputs, acl_param)
         logits = self.cast(logits, mstype.float32)
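Note: the weight fusion done by `_convert_qkv_concat_weight` above is a plain axis-0 concatenation of the per-projection matrices into a single `w_qkv` (and `w_gate_hidden`) weight. A toy numpy sketch with made-up shapes, only to illustrate the shape arithmetic:

    import numpy as np

    hidden = 8
    wq = np.ones((hidden, hidden), dtype=np.float16)
    wk = np.ones((hidden // 2, hidden), dtype=np.float16)   # toy grouped-KV shape
    wv = np.ones((hidden // 2, hidden), dtype=np.float16)

    # Same operation as in the diff: np.concatenate((wq, wk, wv), 0)
    w_qkv = np.concatenate((wq, wk, wv), 0)
    print(w_qkv.shape)   # (16, 8): q, k and v rows stacked into one matrix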