mindspore 2.4.1-cp39-cp39-win_amd64.whl → 2.4.10-cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mindspore might be problematic.
- mindspore/.commit_id +1 -1
- mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/common/api.py +1 -4
- mindspore/common/file_system.py +2 -0
- mindspore/common/parameter.py +1 -14
- mindspore/communication/_comm_helper.py +5 -0
- mindspore/context.py +7 -2
- mindspore/dataset/engine/datasets_standard_format.py +17 -0
- mindspore/dataset/engine/datasets_user_defined.py +27 -1
- mindspore/dnnl.dll +0 -0
- mindspore/experimental/llm_boost/__init__.py +2 -2
- mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
- mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
- mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
- mindspore/include/api/context.h +1 -1
- mindspore/include/dataset/constants.h +2 -2
- mindspore/jpeg62.dll +0 -0
- mindspore/mindspore_backend.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_np_dtype.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/nn/__init__.py +2 -0
- mindspore/nn/cell.py +16 -2
- mindspore/nn/layer/conv.py +3 -0
- mindspore/nn/layer/pooling.py +8 -10
- mindspore/nn/utils/__init__.py +22 -0
- mindspore/nn/utils/init.py +71 -0
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_comm_ops.py +25 -7
- mindspore/ops/auto_generate/gen_ops_prim.py +3 -2
- mindspore/ops/function/math_func.py +5 -4
- mindspore/ops/operations/comm_ops.py +4 -1
- mindspore/ops/operations/custom_ops.py +6 -4
- mindspore/ops/operations/nn_ops.py +7 -2
- mindspore/parallel/_auto_parallel_context.py +23 -4
- mindspore/parallel/_cell_wrapper.py +22 -3
- mindspore/parallel/_utils.py +0 -1
- mindspore/run_check/_check_version.py +17 -8
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/callback/_tft_register.py +7 -6
- mindspore/train/model.py +1 -0
- mindspore/train/serialization.py +4 -1
- mindspore/turbojpeg.dll +0 -0
- mindspore/version.py +1 -1
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/METADATA +2 -2
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/RECORD +62 -60
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/WHEEL +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
- {mindspore-2.4.1.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0
mindspore/experimental/llm_boost/atb/boost_base.py

@@ -13,17 +13,32 @@
 # limitations under the License.
 # ============================================================================
 """boost base class"""
+from enum import Enum
 import numpy as np
 import mindspore as ms
 from mindspore import ops, Tensor
+from mindspore import log as logger
 from mindspore.ops import operations as P
 import mindspore.common.dtype as mstype
 from mindspore._c_expression import _set_format
-
 from mindspore.common.parameter import Parameter
 from mindspore.experimental.llm_boost.utils import get_real_rank, get_real_group_size
 from mindspore.common.initializer import Zero

+FORMAT_NZ = "FRACTAL_NZ"
+BUILDIN_BACKEND_NAME = "ATB"
+
+
+class PositionEmbeddingType(int, Enum):
+    ROPE = 0
+    ALIBI = 1
+    ABSOLUTE = 2
+
+
+class NormType(int, Enum):
+    RMS_NORM = 0
+    LAYER_NORM = 1
+

 class AttentionMask:
     """attention mask"""
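
Note: the two enums above subclass int, so when LlamaBoost/QwenBoost later put them into their parameter dicts, json.dumps serializes them as plain integers. A minimal standalone sketch of that behaviour (not part of the package code):

    import json
    from enum import Enum

    class NormType(int, Enum):
        RMS_NORM = 0
        LAYER_NORM = 1

    # An int-subclass enum serializes as its numeric value, so the config
    # string handed to the native operation carries 0/1 rather than a name.
    print(json.dumps({"normType": NormType.RMS_NORM}))  # {"normType": 0}
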
@@ -31,30 +46,34 @@ class AttentionMask:
     @classmethod
     def static(cls, max_seq_len, dtype=mstype.float16, need_nz=False):
         """cache mask"""
-        bias_cache = Tensor(
-
+        bias_cache = Tensor(
+            np.tril(np.ones((max_seq_len, max_seq_len), dtype=np.bool_))
+        ).reshape(max_seq_len, max_seq_len)
         bias_cache = ~bias_cache
         if dtype == mstype.float16:
             mask_value = Tensor(np.finfo(np.float32).min, mstype.float16)
         else:
             mask_value = Tensor(1)
-        attn_mask = ops.masked_fill(
-            (max_seq_len, max_seq_len)), dtype=mstype.float16),
+        attn_mask = ops.masked_fill(
+            Tensor(np.zeros((max_seq_len, max_seq_len)), dtype=mstype.float16),
+            bias_cache,
+            mask_value,
+        )
         if need_nz:
             # ND -> NZ
             attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len))
-            attn_mask = ops.reshape(
-                attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
+            attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
             attn_mask = ops.transpose(attn_mask, (0, 2, 1, 3)).contiguous()
-            attn_mask = _set_format(attn_mask,
+            attn_mask = _set_format(attn_mask, FORMAT_NZ)
         return attn_mask


-class AtbBoostBase
+class AtbBoostBase:
     """atb boost base class"""

     def __init__(self, config):
         super().__init__()
+        self.backend_name = BUILDIN_BACKEND_NAME
         self.is_first_iteration = False
         self.config = config
         self.dtype = config.compute_dtype
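
Note: AttentionMask.static builds a causal mask; positions above the diagonal (future tokens) are filled with a very large negative value, visible positions stay zero. The same logic in plain NumPy, as an illustrative sketch only:

    import numpy as np

    max_seq_len = 4
    # True on and below the diagonal: positions a token may attend to
    allowed = np.tril(np.ones((max_seq_len, max_seq_len), dtype=np.bool_))
    mask_value = np.finfo(np.float32).min
    # Future positions get the large negative fill, the rest stay 0
    attn_mask = np.where(~allowed, mask_value, 0.0)
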
@@ -68,27 +87,98 @@ class AtbBoostBase():
         self.need_nz = config.need_nz
         self.placeholder = Tensor(np.zeros(1), dtype=self.dtype)
         self.lm_head_indices_fake = Tensor([0], dtype=mstype.int64)
-        self.position_embedding_type =
+        self.position_embedding_type = PositionEmbeddingType.ROPE
         self.add_norm_enable = True
         self.max_decode_length = self.config.max_decode_length
         self.max_base_len = 128
         self.attn_mask = AttentionMask.static(
-            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz
+            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz
+        )

         self.cast = P.Cast()
         self.reshape = P.Reshape()
         self.kv_quant = None
         self.rank_id = get_real_rank()
         self.device_num = get_real_group_size()
+        self.ascend_weight = []
+        self.k_caches = []
+        self.v_caches = []

     def _convert_tensor_format_and_dtype(self, tensor, dtype=mstype.float16):
         tensor = self.cast(tensor, dtype=dtype)
         if self.need_nz:
-            tensor = _set_format(tensor,
+            tensor = _set_format(tensor, FORMAT_NZ)
         return tensor

+    def _convert_qkv_concat_weight(self, param_dict):
+        """convert qkv concat weight"""
+        assume_num_layers = 500
+        for i in range(assume_num_layers):
+            # qkv weight concat
+            wq_weight_name = f"model.layers.{i}.attention.wq.weight"
+            wk_weight_name = f"model.layers.{i}.attention.wk.weight"
+            wv_weight_name = f"model.layers.{i}.attention.wv.weight"
+            qkv_concat_weight_name = f"model.layers.{i}.attention.w_qkv.weight"
+            if wq_weight_name not in param_dict:
+                break
+            wq_weight = param_dict[wq_weight_name].asnumpy()
+            wk_weight = param_dict[wk_weight_name].asnumpy()
+            wv_weight = param_dict[wv_weight_name].asnumpy()
+            qkv_weight = np.concatenate((wq_weight, wk_weight, wv_weight), 0)
+            param_dict[qkv_concat_weight_name] = Parameter(
+                qkv_weight, name=qkv_concat_weight_name
+            )
+
+            # gate hidden weight concat
+            ffn_gate_weight_name = f"model.layers.{i}.feed_forward.w1.weight"
+            ffn_hidden_weight_name = f"model.layers.{i}.feed_forward.w3.weight"
+            gate_hidden_concat_weight_name = (
+                f"model.layers.{i}.feed_forward.w_gate_hidden.weight"
+            )
+
+            ffn_gate_weight = param_dict[ffn_gate_weight_name].asnumpy()
+            ffn_hidden_weight = param_dict[ffn_hidden_weight_name].asnumpy()
+            gate_hidden_weight = np.concatenate((ffn_gate_weight, ffn_hidden_weight), 0)
+            param_dict[gate_hidden_concat_weight_name] = Parameter(
+                gate_hidden_weight, name=gate_hidden_concat_weight_name
+            )
+
+            param_dict.pop(wq_weight_name)
+            param_dict.pop(wk_weight_name)
+            param_dict.pop(wv_weight_name)
+            param_dict.pop(ffn_gate_weight_name)
+            param_dict.pop(ffn_hidden_weight_name)
+            logger.info(f"transform: {qkv_concat_weight_name}")
+            logger.info(f"transform: {gate_hidden_concat_weight_name}")
+
+        for i in range(assume_num_layers):
+            # qkv bias concat
+            wq_bias_name = f"model.layers.{i}.attention.wq.bias"
+            wk_bias_name = f"model.layers.{i}.attention.wk.bias"
+            wv_bias_name = f"model.layers.{i}.attention.wv.bias"
+            qkv_concat_bias_name = f"model.layers.{i}.attention.w_qkv.bias"
+            if wq_bias_name not in param_dict:
+                break
+
+            wq_bias_weight = param_dict[wq_bias_name].asnumpy()
+            wk_bias_weight = param_dict[wk_bias_name].asnumpy()
+            wv_bias_weight = param_dict[wv_bias_name].asnumpy()
+            qkv_bias_weight = np.concatenate(
+                (wq_bias_weight, wk_bias_weight, wv_bias_weight), 0
+            )
+            param_dict[qkv_concat_bias_name] = Parameter(
+                qkv_bias_weight, name=qkv_concat_bias_name
+            )
+
+            param_dict.pop(wq_bias_name)
+            param_dict.pop(wk_bias_name)
+            param_dict.pop(wv_bias_name)
+            logger.info(f"transform: {qkv_concat_bias_name}")
+        return param_dict
+
     def set_weights(self, parm_dict, dtype=mstype.float16):
         """set weights for llm boost"""
+        self._convert_qkv_concat_weight(parm_dict)
         embedding_weight_name = "model.tok_embeddings.embedding_weight"
         attention_norm_name = "attention_norm"
         qkv_name = "attention.w_qkv"
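
Note: _convert_qkv_concat_weight only renames and stacks checkpoint entries: per layer, wq/wk/wv are concatenated along axis 0 into w_qkv, and the feed-forward w1/w3 into w_gate_hidden, after which the originals are popped. A toy NumPy sketch of the shape arithmetic (hypothetical sizes, not taken from any real checkpoint):

    import numpy as np

    hidden, kv_hidden = 4096, 1024          # illustrative projection sizes
    wq = np.zeros((hidden, hidden))
    wk = np.zeros((kv_hidden, hidden))
    wv = np.zeros((kv_hidden, hidden))
    # The fused projection stacks the three output dimensions along axis 0
    w_qkv = np.concatenate((wq, wk, wv), 0)
    assert w_qkv.shape == (hidden + 2 * kv_hidden, hidden)
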
@@ -101,45 +191,88 @@ class AtbBoostBase():
         placeholder = Parameter(Tensor(np.zeros(1), dtype=dtype))

         ascend_weight = []
-        ascend_weight.append(
-            self.cast(parm_dict[embedding_weight_name], dtype))
+        ascend_weight.append(self.cast(parm_dict[embedding_weight_name], dtype))
         for i in range(self.num_layers):
-            ascend_weight.append(
-
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{attention_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{qkv_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{qkv_name}.bias", placeholder),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 16)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{o_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{o_name}.bias", placeholder), dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 4)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_gate_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_gate_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 10)

             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(
-
-
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_down_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_down_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 4)

         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{norm_out_name}.weight"], dtype
+            )
+        )
         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{lm_head_name}.weight"], dtype
+            )
+        )
+        self.ascend_weight = ascend_weight
         self.atb_encoder_operation.set_weights(ascend_weight)
         self.atb_decoder_operation.set_weights(ascend_weight)

@@ -147,20 +280,47 @@ class AtbBoostBase():
         """set kv_cache for llm boost"""
         if not k_caches or v_caches:
             if self.need_nz:
-                kv_shape = (
-
-
-
-
-
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.num_kv_heads * self.head_dim // self.device_num // 16,
+                    self.config.block_size,
+                    16,
+                )
+                k_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
             else:
-                kv_shape = (
-
-
-
-
-
-
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.config.block_size,
+                    self.num_kv_heads // self.device_num,
+                    self.head_dim,
+                )
+                k_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+        self.k_caches = k_caches
+        self.v_caches = v_caches
         self.atb_encoder_operation.set_kvcache(k_caches, v_caches)
         self.atb_decoder_operation.set_kvcache(k_caches, v_caches)

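
Note: the two cache layouts hold the same data. The ND shape is (num_blocks, block_size, kv_heads_per_rank, head_dim), while the NZ shape folds the head dimension into 16-element fractal blocks. A quick check with made-up configuration values (illustrative only):

    num_blocks, block_size = 512, 128
    num_kv_heads, head_dim, device_num = 8, 128, 1

    nd_shape = (num_blocks, block_size, num_kv_heads // device_num, head_dim)
    nz_shape = (
        num_blocks,
        num_kv_heads * head_dim // device_num // 16,
        block_size,
        16,
    )
    # Elements per block match between the two layouts
    assert nd_shape[1] * nd_shape[2] * nd_shape[3] == nz_shape[1] * nz_shape[2] * nz_shape[3]
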
@@ -171,11 +331,9 @@ class AtbBoostBase():
     def _execute_operator(self, acl_inputs, acl_param):
         """execute operator."""
         if self.is_first_iteration:
-            acl_model_out = self.atb_encoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_encoder_operation.forward(acl_inputs, acl_param)
         else:
-            acl_model_out = self.atb_decoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_decoder_operation.forward(acl_inputs, acl_param)
         acl_hidden_state = acl_model_out[0]
         return acl_hidden_state

@@ -183,28 +341,46 @@ class AtbBoostBase():
         r"""
         LlmBoost forward.
         """
-        input_ids = boost_inputs
-        position_ids = boost_inputs
-        cos_embed = boost_inputs
-        sin_embed = boost_inputs
-        block_tables = boost_inputs
-        slot_mapping = boost_inputs
-        batch_valid_length = boost_inputs
-        lm_head_indices = boost_inputs
-        seqLen = boost_inputs
+        input_ids = boost_inputs.get("input_ids", None)
+        position_ids = boost_inputs.get("position_ids", None)
+        cos_embed = boost_inputs.get("cos_embed", None)
+        sin_embed = boost_inputs.get("sin_embed", None)
+        block_tables = boost_inputs.get("block_tables", None)
+        slot_mapping = boost_inputs.get("slot_mapping", None)
+        batch_valid_length = boost_inputs.get("batch_valid_length", None)
+        lm_head_indices = boost_inputs.get("lm_head_indices", None)
+        seqLen = boost_inputs.get("seq_lens", None)
+        input_ids = self.reshape(input_ids, (-1,))
         if self.is_first_iteration:
             attention_mask = self.attn_mask
         else:
-            position_ids
+            if position_ids is None:
+                position_ids = batch_valid_length - 1
             attention_mask = self.placeholder
             lm_head_indices = self.lm_head_indices_fake

-
-
-
-
-
-
+        if input_ids is not None and input_ids.dtype != mstype.int64:
+            input_ids = self.cast(input_ids, mstype.int64)
+        if position_ids is not None and position_ids.dtype != mstype.int64:
+            position_ids = self.cast(position_ids, mstype.int64)
+        if batch_valid_length is not None and batch_valid_length.dtype != mstype.int32:
+            batch_valid_length = self.cast(batch_valid_length, mstype.int32)
+        if lm_head_indices is not None and lm_head_indices.dtype != mstype.int64:
+            lm_head_indices = self.cast(lm_head_indices, mstype.int64)
+
+        acl_inputs, acl_param = self._prepare_inputs(
+            prefill=self.is_first_iteration,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cos_embed=cos_embed,
+            sin_embed=sin_embed,
+            attention_mask=attention_mask,
+            block_tables=block_tables,
+            slots=slot_mapping,
+            input_lengths=batch_valid_length,
+            lm_head_indices=lm_head_indices,
+            seqLen=seqLen,
+        )
         ms.hal.synchronize()
         logits = self._execute_operator(acl_inputs, acl_param)
         logits = self.cast(logits, mstype.float32)
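
Note: forward now reads its operands out of a boost_inputs dict keyed as in the .get calls above. A hypothetical payload with placeholder shapes (real values would come from the serving runtime):

    import numpy as np
    import mindspore as ms
    from mindspore import Tensor

    boost_inputs = {
        "input_ids": Tensor(np.array([1, 2, 3, 4]), ms.int64),
        "position_ids": Tensor(np.arange(4), ms.int64),
        "cos_embed": Tensor(np.zeros((4, 128)), ms.float16),
        "sin_embed": Tensor(np.zeros((4, 128)), ms.float16),
        "block_tables": Tensor(np.zeros((1, 8)), ms.int32),
        "slot_mapping": Tensor(np.arange(4), ms.int32),
        "batch_valid_length": Tensor(np.array([4]), ms.int32),
        "lm_head_indices": Tensor(np.array([3]), ms.int64),
        "seq_lens": [4],
    }
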
mindspore/experimental/llm_boost/atb/llama_boost.py

@@ -15,10 +15,16 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import
+from mindspore.experimental.llm_boost.atb.boost_base import (
+    AtbBoostBase,
+    PositionEmbeddingType,
+    NormType,
+)
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType

+CPP_LLAMA_MODEL_CLASS_NAME = "llama_LlamaDecoderModel"
+

 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Llama")
 class LlamaBoost(AtbBoostBase):
@@ -30,14 +36,17 @@ class LlamaBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )

     def init(self):
         """set param"""
         coder_param = {
-            "
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
@@ -46,32 +55,41 @@ class LlamaBoost(AtbBoostBase):
             "isFA": False,
             "isBF16": self.dtype == mstype.bfloat16,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [
-
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "isEmbeddingParallel": False,
             "isLmHeadParallel": not self.config.parallel_config.vocab_emb_dp,
             "lmHeadTransposeType": 1,
-            "
-            "
+            "enableSwiGLU": True,
+            "enablekvQuant": self.kv_quant is not None,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend":
+            "backend": self.config.communication_backend,
             "rankTableFile": "",
-            "positionEmbeddingType":
+            "positionEmbeddingType": PositionEmbeddingType.ROPE,
             "hiddenSize": self.config.hidden_size,
             "gemma": False,
-            "enableAddNorm":
-            "
+            "enableAddNorm": False,
+            "enableCompressHead": False,
+            "isUnpadInputs": True,
         }
         encoder_param = {
-            **coder_param,
-            "
-            "
-            "
+            **coder_param,
+            "isPrefill": True,
+            "enableLcoc": True,
+            "enableSpeculate": False,
+            "skipWordEmbedding": False,
+            "enableSplitFuse": False,
         }
         decoder_param = {
-            **coder_param,
-            "
+            **coder_param,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
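
Note: encoder_param and decoder_param share coder_param and differ only in the prefill-related switches before being serialized for the native operation. A simplified sketch of that dict-spread pattern (trimmed-down keys, not the full parameter set):

    import json

    coder_param = {"numHiddenLayers": 32, "rank": 0}
    encoder_param = {**coder_param, "isPrefill": True, "enableLcoc": True}
    decoder_param = {**coder_param, "isPrefill": False, "enableLcoc": False}

    # Each operation receives its own JSON config string, e.g.
    # {"numHiddenLayers": 32, "rank": 0, "isPrefill": true, "enableLcoc": true}
    encoder_json = json.dumps(encoder_param)
    decoder_json = json.dumps(decoder_param)
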
@@ -92,14 +110,15 @@ class LlamaBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps(
-
-
-
-
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
         self.acl_decoder_operation_inputs[1] = self.placeholder
-        self.acl_decoder_operation_inputs[2] =
-            position_ids, mstype.int32)
+        self.acl_decoder_operation_inputs[2] = position_ids
         self.acl_decoder_operation_inputs[3] = cos_embed
         self.acl_decoder_operation_inputs[4] = sin_embed
         self.acl_decoder_operation_inputs[5] = attention_mask
@@ -108,8 +127,6 @@ class LlamaBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[8] = self.placeholder
         self.acl_decoder_operation_inputs[9] = self.placeholder
         self.acl_decoder_operation_inputs[10] = self.placeholder
-        self.acl_decoder_operation_inputs[11] =
-
-        self.acl_decoder_operation_inputs[12] = self.cast(
-            lm_head_indices, mstype.int64)
+        self.acl_decoder_operation_inputs[11] = input_lengths
+        self.acl_decoder_operation_inputs[12] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param
mindspore/experimental/llm_boost/atb/qwen_boost.py

@@ -15,11 +15,14 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase, NormType
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType


+CPP_QWEN_MODEL_CLASS_NAME = "qwen_QwenDecoderModel"
+
+
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Qwen")
 class QwenBoost(AtbBoostBase):
     """QwenBoost class"""
@@ -30,9 +33,11 @@ class QwenBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )

     def init(self):
         """set param"""
@@ -42,24 +47,43 @@ class QwenBoost(AtbBoostBase):
             "withEmbedding": True,
             "isEmbeddingParallel": True,
             "isLmHeadParallel": True,
-            "linearTransposeType": [
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "lmHeadTransposeType": 1,
-            "
-            "
+            "enableSwiGLU": not self.need_nz,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
             "numKeyValueHeadsPerRank": self.n_kv_heads // self.device_num,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend":
+            "backend": self.config.communication_backend,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [
-
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearHasBias": [[True, False, False, False]] * self.num_layers,
+            "enableKvQuant": self.kv_quant is not None,
+            "enableLora": False,
+            "isUnpadInputs": True,
+            "enableAddNorm": False,
+        }
+        encoder_param = {
+            **param_dict,
+            "isPrefill": True,
+            "enableLcoc": False,
+            "enableSplitFuse": False,
+        }
+        decoder_param = {
+            **param_dict,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
+            "enablePrefixCache": False,
         }
-        encoder_param = {**param_dict, "isPrefill": True, "supportLcoc": False}
-        decoder_param = {**param_dict, "isPrefill": False,
-                         "supportLcoc": False, "supportSpeculate": False}
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))

@@ -79,13 +103,14 @@ class QwenBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps(
-
-
-
-
-
-
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
+        self.acl_decoder_operation_inputs[1] = position_ids
         self.acl_decoder_operation_inputs[2] = cos_embed
         self.acl_decoder_operation_inputs[3] = sin_embed
         self.acl_decoder_operation_inputs[4] = attention_mask
@@ -93,9 +118,7 @@ class QwenBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[6] = slots
         self.acl_decoder_operation_inputs[7] = self.placeholder
         self.acl_decoder_operation_inputs[8] = self.placeholder
-        self.acl_decoder_operation_inputs[9] = self.
-
-        self.acl_decoder_operation_inputs[
-            lm_head_indices, mstype.int64)
-        self.acl_decoder_operation_inputs[11] = self.placeholder
+        self.acl_decoder_operation_inputs[9] = self.placeholder
+        self.acl_decoder_operation_inputs[10] = input_lengths
+        self.acl_decoder_operation_inputs[11] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param