mindspore-2.4.0-cp39-cp39-win_amd64.whl → mindspore-2.4.10-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mindspore might be problematic.

Files changed (87)
  1. mindspore/.commit_id +1 -1
  2. mindspore/_c_dataengine.cp39-win_amd64.pyd +0 -0
  3. mindspore/_c_expression.cp39-win_amd64.pyd +0 -0
  4. mindspore/_c_mindrecord.cp39-win_amd64.pyd +0 -0
  5. mindspore/avcodec-59.dll +0 -0
  6. mindspore/avdevice-59.dll +0 -0
  7. mindspore/avfilter-8.dll +0 -0
  8. mindspore/avformat-59.dll +0 -0
  9. mindspore/avutil-57.dll +0 -0
  10. mindspore/common/api.py +1 -4
  11. mindspore/common/file_system.py +2 -0
  12. mindspore/common/initializer.py +51 -15
  13. mindspore/common/parameter.py +6 -5
  14. mindspore/common/tensor.py +15 -49
  15. mindspore/communication/_comm_helper.py +5 -0
  16. mindspore/communication/comm_func.py +7 -7
  17. mindspore/context.py +16 -2
  18. mindspore/dataset/engine/datasets_standard_format.py +17 -0
  19. mindspore/dataset/engine/datasets_user_defined.py +27 -1
  20. mindspore/dnnl.dll +0 -0
  21. mindspore/experimental/llm_boost/__init__.py +2 -2
  22. mindspore/experimental/llm_boost/atb/boost_base.py +240 -64
  23. mindspore/experimental/llm_boost/atb/llama_boost.py +46 -29
  24. mindspore/experimental/llm_boost/atb/qwen_boost.py +47 -24
  25. mindspore/include/api/context.h +1 -1
  26. mindspore/include/dataset/constants.h +2 -2
  27. mindspore/jpeg62.dll +0 -0
  28. mindspore/mindspore_backend.dll +0 -0
  29. mindspore/mindspore_common.dll +0 -0
  30. mindspore/mindspore_core.dll +0 -0
  31. mindspore/mindspore_glog.dll +0 -0
  32. mindspore/mindspore_np_dtype.dll +0 -0
  33. mindspore/mindspore_ops.dll +0 -0
  34. mindspore/mint/__init__.py +490 -2
  35. mindspore/mint/nn/__init__.py +2 -2
  36. mindspore/mint/optim/adamw.py +6 -14
  37. mindspore/nn/__init__.py +2 -0
  38. mindspore/nn/cell.py +16 -4
  39. mindspore/nn/layer/basic.py +24 -7
  40. mindspore/nn/layer/conv.py +3 -0
  41. mindspore/nn/layer/embedding.py +31 -14
  42. mindspore/nn/layer/pooling.py +8 -10
  43. mindspore/nn/optim/tft_wrapper.py +12 -15
  44. mindspore/nn/utils/__init__.py +22 -0
  45. mindspore/nn/utils/init.py +71 -0
  46. mindspore/opencv_core452.dll +0 -0
  47. mindspore/opencv_imgcodecs452.dll +0 -0
  48. mindspore/opencv_imgproc452.dll +0 -0
  49. mindspore/ops/_grad_experimental/grad_array_ops.py +0 -11
  50. mindspore/ops/_grad_experimental/grad_comm_ops.py +45 -8
  51. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +6 -0
  52. mindspore/ops/auto_generate/gen_extend_func.py +33 -0
  53. mindspore/ops/auto_generate/gen_ops_def.py +52 -3
  54. mindspore/ops/auto_generate/gen_ops_prim.py +158 -8
  55. mindspore/ops/function/array_func.py +2 -0
  56. mindspore/ops/function/math_func.py +12 -5
  57. mindspore/ops/function/random_func.py +221 -7
  58. mindspore/ops/operations/__init__.py +1 -1
  59. mindspore/ops/operations/array_ops.py +3 -1
  60. mindspore/ops/operations/comm_ops.py +25 -1
  61. mindspore/ops/operations/custom_ops.py +6 -4
  62. mindspore/ops/operations/manually_defined/ops_def.py +8 -10
  63. mindspore/ops/operations/nn_ops.py +7 -2
  64. mindspore/parallel/_auto_parallel_context.py +26 -5
  65. mindspore/parallel/_cell_wrapper.py +24 -3
  66. mindspore/parallel/_tensor.py +46 -2
  67. mindspore/parallel/_utils.py +39 -21
  68. mindspore/parallel/transform_safetensors.py +196 -43
  69. mindspore/profiler/profiling.py +5 -1
  70. mindspore/run_check/_check_version.py +20 -9
  71. mindspore/swresample-4.dll +0 -0
  72. mindspore/swscale-6.dll +0 -0
  73. mindspore/tinyxml2.dll +0 -0
  74. mindspore/train/_utils.py +92 -32
  75. mindspore/train/callback/_checkpoint.py +12 -9
  76. mindspore/train/callback/_on_request_exit.py +12 -1
  77. mindspore/train/callback/_tft_register.py +33 -9
  78. mindspore/train/dataset_helper.py +10 -2
  79. mindspore/train/model.py +21 -0
  80. mindspore/train/serialization.py +12 -19
  81. mindspore/turbojpeg.dll +0 -0
  82. mindspore/version.py +1 -1
  83. {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/METADATA +9 -7
  84. {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/RECORD +87 -85
  85. {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/WHEEL +1 -1
  86. {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/entry_points.txt +0 -0
  87. {mindspore-2.4.0.dist-info → mindspore-2.4.10.dist-info}/top_level.txt +0 -0

mindspore/experimental/llm_boost/atb/boost_base.py
@@ -13,17 +13,32 @@
 # limitations under the License.
 # ============================================================================
 """boost base class"""
+from enum import Enum
 import numpy as np
 import mindspore as ms
 from mindspore import ops, Tensor
+from mindspore import log as logger
 from mindspore.ops import operations as P
 import mindspore.common.dtype as mstype
 from mindspore._c_expression import _set_format
-
 from mindspore.common.parameter import Parameter
 from mindspore.experimental.llm_boost.utils import get_real_rank, get_real_group_size
 from mindspore.common.initializer import Zero
 
+FORMAT_NZ = "FRACTAL_NZ"
+BUILDIN_BACKEND_NAME = "ATB"
+
+
+class PositionEmbeddingType(int, Enum):
+    ROPE = 0
+    ALIBI = 1
+    ABSOLUTE = 2
+
+
+class NormType(int, Enum):
+    RMS_NORM = 0
+    LAYER_NORM = 1
+
 
 class AttentionMask:
     """attention mask"""
@@ -31,30 +46,34 @@ class AttentionMask:
     @classmethod
     def static(cls, max_seq_len, dtype=mstype.float16, need_nz=False):
         """cache mask"""
-        bias_cache = Tensor(np.tril(np.ones((max_seq_len, max_seq_len), dtype=np.bool_))).reshape(max_seq_len,
-                                                                                                   max_seq_len)
+        bias_cache = Tensor(
+            np.tril(np.ones((max_seq_len, max_seq_len), dtype=np.bool_))
+        ).reshape(max_seq_len, max_seq_len)
         bias_cache = ~bias_cache
         if dtype == mstype.float16:
             mask_value = Tensor(np.finfo(np.float32).min, mstype.float16)
         else:
             mask_value = Tensor(1)
-        attn_mask = ops.masked_fill(Tensor(np.zeros(
-            (max_seq_len, max_seq_len)), dtype=mstype.float16), bias_cache, mask_value)
+        attn_mask = ops.masked_fill(
+            Tensor(np.zeros((max_seq_len, max_seq_len)), dtype=mstype.float16),
+            bias_cache,
+            mask_value,
+        )
         if need_nz:
             # ND -> NZ
             attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len))
-            attn_mask = ops.reshape(
-                attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
+            attn_mask = ops.reshape(attn_mask, (1, max_seq_len, max_seq_len // 16, 16))
             attn_mask = ops.transpose(attn_mask, (0, 2, 1, 3)).contiguous()
-            attn_mask = _set_format(attn_mask, "FRACTAL_NZ")
+            attn_mask = _set_format(attn_mask, FORMAT_NZ)
         return attn_mask
 
 
-class AtbBoostBase():
+class AtbBoostBase:
     """atb boost base class"""
 
     def __init__(self, config):
         super().__init__()
+        self.backend_name = BUILDIN_BACKEND_NAME
         self.is_first_iteration = False
         self.config = config
         self.dtype = config.compute_dtype
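
Note: the new PositionEmbeddingType and NormType enums subclass int, so their members serialize as bare integers when the ATB parameter dictionaries are later passed through json.dumps in the subclasses' init() methods. A minimal illustration (not part of the package):

    from enum import Enum
    import json

    class NormType(int, Enum):
        RMS_NORM = 0
        LAYER_NORM = 1

    # int-backed enum members are encoded as plain integers, so the ATB side
    # receives {"normType": 0} rather than an enum name.
    print(json.dumps({"normType": NormType.RMS_NORM}))  # -> {"normType": 0}
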
@@ -68,27 +87,98 @@ class AtbBoostBase():
         self.need_nz = config.need_nz
         self.placeholder = Tensor(np.zeros(1), dtype=self.dtype)
         self.lm_head_indices_fake = Tensor([0], dtype=mstype.int64)
-        self.position_embedding_type = "ROPE"
+        self.position_embedding_type = PositionEmbeddingType.ROPE
         self.add_norm_enable = True
         self.max_decode_length = self.config.max_decode_length
         self.max_base_len = 128
         self.attn_mask = AttentionMask.static(
-            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz)
+            self.max_base_len, dtype=self.dtype, need_nz=self.need_nz
+        )
 
         self.cast = P.Cast()
         self.reshape = P.Reshape()
         self.kv_quant = None
         self.rank_id = get_real_rank()
         self.device_num = get_real_group_size()
+        self.ascend_weight = []
+        self.k_caches = []
+        self.v_caches = []
 
     def _convert_tensor_format_and_dtype(self, tensor, dtype=mstype.float16):
         tensor = self.cast(tensor, dtype=dtype)
         if self.need_nz:
-            tensor = _set_format(tensor, "FRACTAL_NZ")
+            tensor = _set_format(tensor, FORMAT_NZ)
         return tensor
 
+    def _convert_qkv_concat_weight(self, param_dict):
+        """convert qkv concat weight"""
+        assume_num_layers = 500
+        for i in range(assume_num_layers):
+            # qkv weight concat
+            wq_weight_name = f"model.layers.{i}.attention.wq.weight"
+            wk_weight_name = f"model.layers.{i}.attention.wk.weight"
+            wv_weight_name = f"model.layers.{i}.attention.wv.weight"
+            qkv_concat_weight_name = f"model.layers.{i}.attention.w_qkv.weight"
+            if wq_weight_name not in param_dict:
+                break
+            wq_weight = param_dict[wq_weight_name].asnumpy()
+            wk_weight = param_dict[wk_weight_name].asnumpy()
+            wv_weight = param_dict[wv_weight_name].asnumpy()
+            qkv_weight = np.concatenate((wq_weight, wk_weight, wv_weight), 0)
+            param_dict[qkv_concat_weight_name] = Parameter(
+                qkv_weight, name=qkv_concat_weight_name
+            )
+
+            # gate hidden weight concat
+            ffn_gate_weight_name = f"model.layers.{i}.feed_forward.w1.weight"
+            ffn_hidden_weight_name = f"model.layers.{i}.feed_forward.w3.weight"
+            gate_hidden_concat_weight_name = (
+                f"model.layers.{i}.feed_forward.w_gate_hidden.weight"
+            )
+
+            ffn_gate_weight = param_dict[ffn_gate_weight_name].asnumpy()
+            ffn_hidden_weight = param_dict[ffn_hidden_weight_name].asnumpy()
+            gate_hidden_weight = np.concatenate((ffn_gate_weight, ffn_hidden_weight), 0)
+            param_dict[gate_hidden_concat_weight_name] = Parameter(
+                gate_hidden_weight, name=gate_hidden_concat_weight_name
+            )
+
+            param_dict.pop(wq_weight_name)
+            param_dict.pop(wk_weight_name)
+            param_dict.pop(wv_weight_name)
+            param_dict.pop(ffn_gate_weight_name)
+            param_dict.pop(ffn_hidden_weight_name)
+            logger.info(f"transform: {qkv_concat_weight_name}")
+            logger.info(f"transform: {gate_hidden_concat_weight_name}")
+
+        for i in range(assume_num_layers):
+            # qkv bias concat
+            wq_bias_name = f"model.layers.{i}.attention.wq.bias"
+            wk_bias_name = f"model.layers.{i}.attention.wk.bias"
+            wv_bias_name = f"model.layers.{i}.attention.wv.bias"
+            qkv_concat_bias_name = f"model.layers.{i}.attention.w_qkv.bias"
+            if wq_bias_name not in param_dict:
+                break
+
+            wq_bias_weight = param_dict[wq_bias_name].asnumpy()
+            wk_bias_weight = param_dict[wk_bias_name].asnumpy()
+            wv_bias_weight = param_dict[wv_bias_name].asnumpy()
+            qkv_bias_weight = np.concatenate(
+                (wq_bias_weight, wk_bias_weight, wv_bias_weight), 0
+            )
+            param_dict[qkv_concat_bias_name] = Parameter(
+                qkv_bias_weight, name=qkv_concat_bias_name
+            )
+
+            param_dict.pop(wq_bias_name)
+            param_dict.pop(wk_bias_name)
+            param_dict.pop(wv_bias_name)
+            logger.info(f"transform: {qkv_concat_bias_name}")
+        return param_dict
+
     def set_weights(self, parm_dict, dtype=mstype.float16):
         """set weights for llm boost"""
+        self._convert_qkv_concat_weight(parm_dict)
         embedding_weight_name = "model.tok_embeddings.embedding_weight"
         attention_norm_name = "attention_norm"
         qkv_name = "attention.w_qkv"
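
Note: the new _convert_qkv_concat_weight helper fuses the separate q/k/v projection weights (and the feed-forward gate/hidden weights) into single concatenated parameters before set_weights hands them to the ATB operations. A minimal NumPy sketch of the fusion, using hypothetical shapes:

    import numpy as np

    hidden = 64  # hypothetical hidden size
    wq = np.random.randn(hidden, hidden).astype(np.float16)
    wk = np.random.randn(hidden, hidden).astype(np.float16)
    wv = np.random.randn(hidden, hidden).astype(np.float16)

    # Row-wise concatenation along axis 0, mirroring _convert_qkv_concat_weight.
    w_qkv = np.concatenate((wq, wk, wv), 0)
    assert w_qkv.shape == (3 * hidden, hidden)
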
@@ -101,45 +191,88 @@ class AtbBoostBase():
         placeholder = Parameter(Tensor(np.zeros(1), dtype=dtype))
 
         ascend_weight = []
-        ascend_weight.append(
-            self.cast(parm_dict[embedding_weight_name], dtype))
+        ascend_weight.append(self.cast(parm_dict[embedding_weight_name], dtype))
         for i in range(self.num_layers):
-            ascend_weight.append(self._convert_tensor_format_and_dtype(
-                parm_dict[f"model.layers.{i}.{attention_norm_name}.weight"], dtype))
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{attention_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{qkv_name}.weight"], dtype))
-            ascend_weight.append(self._convert_tensor_format_and_dtype(parm_dict.get(
-                f"model.layers.{i}.{qkv_name}.bias", placeholder), dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{qkv_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{qkv_name}.bias", placeholder),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 16)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{o_name}.weight"], dtype))
-            ascend_weight.append(self._convert_tensor_format_and_dtype(parm_dict.get(
-                f"model.layers.{i}.{o_name}.bias", placeholder), dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{o_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(f"model.layers.{i}.{o_name}.bias", placeholder), dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 4)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{mlp_norm_name}.weight"], dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_norm_name}.weight"], dtype
+                )
+            )
             ascend_weight.extend([placeholder] * 3)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{mlp_gate_name}.weight"], dtype))
-            ascend_weight.append(self._convert_tensor_format_and_dtype(parm_dict.get(
-                f"model.layers.{i}.{mlp_gate_name}.bias", placeholder), dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_gate_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_gate_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 10)
 
             ascend_weight.append(
-                self._convert_tensor_format_and_dtype(parm_dict[f"model.layers.{i}.{mlp_down_name}.weight"], dtype))
-            ascend_weight.append(self._convert_tensor_format_and_dtype(parm_dict.get(
-                f"model.layers.{i}.{mlp_down_name}.bias", placeholder), dtype))
+                self._convert_tensor_format_and_dtype(
+                    parm_dict[f"model.layers.{i}.{mlp_down_name}.weight"], dtype
+                )
+            )
+            ascend_weight.append(
+                self._convert_tensor_format_and_dtype(
+                    parm_dict.get(
+                        f"model.layers.{i}.{mlp_down_name}.bias", placeholder
+                    ),
+                    dtype,
+                )
+            )
             ascend_weight.extend([placeholder] * 4)
 
         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(parm_dict[f"{norm_out_name}.weight"], dtype))
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{norm_out_name}.weight"], dtype
+            )
+        )
         ascend_weight.append(
-            self._convert_tensor_format_and_dtype(parm_dict[f"{lm_head_name}.weight"], dtype))
+            self._convert_tensor_format_and_dtype(
+                parm_dict[f"{lm_head_name}.weight"], dtype
+            )
+        )
+        self.ascend_weight = ascend_weight
         self.atb_encoder_operation.set_weights(ascend_weight)
         self.atb_decoder_operation.set_weights(ascend_weight)
 
@@ -147,20 +280,47 @@ class AtbBoostBase():
         """set kv_cache for llm boost"""
         if not k_caches or v_caches:
             if self.need_nz:
-                kv_shape = (self.config.num_blocks, self.num_kv_heads*self.head_dim //
-                            self.device_num // 16, self.config.block_size, 16)
-                k_caches = [_set_format(Parameter(Tensor(
-                    shape=kv_shape, dtype=self.dtype, init=Zero())), "FRACTAL_NZ") for _ in range(self.num_layers)]
-                v_caches = [_set_format(Parameter(Tensor(
-                    shape=kv_shape, dtype=self.dtype, init=Zero())), "FRACTAL_NZ") for _ in range(self.num_layers)]
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.num_kv_heads * self.head_dim // self.device_num // 16,
+                    self.config.block_size,
+                    16,
+                )
+                k_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    _set_format(
+                        Parameter(
+                            Tensor(shape=kv_shape, dtype=self.dtype, init=Zero())
+                        ),
+                        FORMAT_NZ,
+                    )
+                    for _ in range(self.num_layers)
+                ]
             else:
-                kv_shape = (self.config.num_blocks, self.config.block_size,
-                            self.num_kv_heads // self.device_num, self.head_dim)
-                k_caches = [Parameter(Tensor(
-                    shape=kv_shape, dtype=self.dtype, init=Zero())) for _ in range(self.num_layers)]
-                v_caches = [Parameter(Tensor(
-                    shape=kv_shape, dtype=self.dtype, init=Zero())) for _ in range(self.num_layers)]
-
+                kv_shape = (
+                    self.config.num_blocks,
+                    self.config.block_size,
+                    self.num_kv_heads // self.device_num,
+                    self.head_dim,
+                )
+                k_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+                v_caches = [
+                    Parameter(Tensor(shape=kv_shape, dtype=self.dtype, init=Zero()))
+                    for _ in range(self.num_layers)
+                ]
+        self.k_caches = k_caches
+        self.v_caches = v_caches
         self.atb_encoder_operation.set_kvcache(k_caches, v_caches)
         self.atb_decoder_operation.set_kvcache(k_caches, v_caches)
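
Note: the two cache layouts produced by set_kvcache differ only in shape. A quick sketch with hypothetical config values, following the formulas above:

    # Hypothetical values; the real ones come from the llm_boost config.
    num_blocks, block_size = 1024, 128
    num_kv_heads, head_dim, device_num = 8, 128, 2

    # Default (ND) layout: block slots by sharded KV heads by head_dim.
    nd_shape = (num_blocks, block_size, num_kv_heads // device_num, head_dim)

    # need_nz=True (FRACTAL_NZ) layout: the head/hidden axis folded into 16-wide fractals.
    nz_shape = (num_blocks, num_kv_heads * head_dim // device_num // 16, block_size, 16)

    print(nd_shape)  # (1024, 128, 4, 128)
    print(nz_shape)  # (1024, 32, 128, 16)
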
@@ -171,11 +331,9 @@ class AtbBoostBase():
     def _execute_operator(self, acl_inputs, acl_param):
         """execute operator."""
         if self.is_first_iteration:
-            acl_model_out = self.atb_encoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_encoder_operation.forward(acl_inputs, acl_param)
         else:
-            acl_model_out = self.atb_decoder_operation.forward(
-                acl_inputs, acl_param)
+            acl_model_out = self.atb_decoder_operation.forward(acl_inputs, acl_param)
         acl_hidden_state = acl_model_out[0]
         return acl_hidden_state
 
@@ -183,28 +341,46 @@ class AtbBoostBase():
         r"""
         LlmBoost forward.
         """
-        input_ids = boost_inputs["input_ids"]
-        position_ids = boost_inputs["position_ids"]
-        cos_embed = boost_inputs["cos_embed"]
-        sin_embed = boost_inputs["sin_embed"]
-        block_tables = boost_inputs["block_tables"]
-        slot_mapping = boost_inputs["slot_mapping"]
-        batch_valid_length = boost_inputs["batch_valid_length"]
-        lm_head_indices = boost_inputs["lm_head_indices"]
-        seqLen = boost_inputs["seq_lens"]
+        input_ids = boost_inputs.get("input_ids", None)
+        position_ids = boost_inputs.get("position_ids", None)
+        cos_embed = boost_inputs.get("cos_embed", None)
+        sin_embed = boost_inputs.get("sin_embed", None)
+        block_tables = boost_inputs.get("block_tables", None)
+        slot_mapping = boost_inputs.get("slot_mapping", None)
+        batch_valid_length = boost_inputs.get("batch_valid_length", None)
+        lm_head_indices = boost_inputs.get("lm_head_indices", None)
+        seqLen = boost_inputs.get("seq_lens", None)
+        input_ids = self.reshape(input_ids, (-1,))
         if self.is_first_iteration:
             attention_mask = self.attn_mask
         else:
-            position_ids = batch_valid_length - 1
+            if position_ids is None:
+                position_ids = batch_valid_length - 1
             attention_mask = self.placeholder
             lm_head_indices = self.lm_head_indices_fake
 
-        acl_inputs, acl_param = self._prepare_inputs(prefill=self.is_first_iteration, input_ids=input_ids,
-                                                     position_ids=position_ids, cos_embed=cos_embed,
-                                                     sin_embed=sin_embed, attention_mask=attention_mask,
-                                                     block_tables=block_tables, slots=slot_mapping,
-                                                     input_lengths=batch_valid_length, lm_head_indices=lm_head_indices,
-                                                     seqLen=seqLen)
+        if input_ids is not None and input_ids.dtype != mstype.int64:
+            input_ids = self.cast(input_ids, mstype.int64)
+        if position_ids is not None and position_ids.dtype != mstype.int64:
+            position_ids = self.cast(position_ids, mstype.int64)
+        if batch_valid_length is not None and batch_valid_length.dtype != mstype.int32:
+            batch_valid_length = self.cast(batch_valid_length, mstype.int32)
+        if lm_head_indices is not None and lm_head_indices.dtype != mstype.int64:
+            lm_head_indices = self.cast(lm_head_indices, mstype.int64)
+
+        acl_inputs, acl_param = self._prepare_inputs(
+            prefill=self.is_first_iteration,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cos_embed=cos_embed,
+            sin_embed=sin_embed,
+            attention_mask=attention_mask,
+            block_tables=block_tables,
+            slots=slot_mapping,
+            input_lengths=batch_valid_length,
+            lm_head_indices=lm_head_indices,
+            seqLen=seqLen,
+        )
         ms.hal.synchronize()
         logits = self._execute_operator(acl_inputs, acl_param)
         logits = self.cast(logits, mstype.float32)
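
Note: with the switch from direct indexing to dict.get, forward() now tolerates missing optional entries instead of raising KeyError. A sketch of the key set it reads (values elided; comments reflect the handling shown above):

    # Keys read by AtbBoostBase.forward(); values elided here.
    boost_inputs = {
        "input_ids": ...,           # flattened, cast to int64 if needed
        "position_ids": ...,        # decode: falls back to batch_valid_length - 1 when absent
        "cos_embed": ...,
        "sin_embed": ...,
        "block_tables": ...,
        "slot_mapping": ...,
        "batch_valid_length": ...,  # cast to int32 if needed
        "lm_head_indices": ...,     # decode: replaced by lm_head_indices_fake
        "seq_lens": ...,
    }
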

mindspore/experimental/llm_boost/atb/llama_boost.py
@@ -15,10 +15,16 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import (
+    AtbBoostBase,
+    PositionEmbeddingType,
+    NormType,
+)
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
+CPP_LLAMA_MODEL_CLASS_NAME = "llama_LlamaDecoderModel"
+
 
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Llama")
 class LlamaBoost(AtbBoostBase):
@@ -30,14 +36,17 @@ class LlamaBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "llama_parallel_DecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "llama_parallel_DecoderModel")
+            self.backend_name, CPP_LLAMA_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
         coder_param = {
-            "rmsNormEps": self.config.rms_norm_eps,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
@@ -46,32 +55,41 @@ class LlamaBoost(AtbBoostBase):
             "isFA": False,
             "isBF16": self.dtype == mstype.bfloat16,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)],
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "isEmbeddingParallel": False,
             "isLmHeadParallel": not self.config.parallel_config.vocab_emb_dp,
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": True,
-            "kvQuant": self.kv_quant is not None,
+            "enableSwiGLU": True,
+            "enablekvQuant": self.kv_quant is not None,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": "lccl",
+            "backend": self.config.communication_backend,
             "rankTableFile": "",
-            "positionEmbeddingType": self.position_embedding_type,
+            "positionEmbeddingType": PositionEmbeddingType.ROPE,
             "hiddenSize": self.config.hidden_size,
             "gemma": False,
-            "enableAddNorm": True,
-            "supportCompressHead": False,
+            "enableAddNorm": False,
+            "enableCompressHead": False,
+            "isUnpadInputs": True,
         }
         encoder_param = {
-            **coder_param, "isPrefill": True,
-            "supportLcoc": True,
-            "supportSpeculate": False,
-            "skipWordEmbedding": False
+            **coder_param,
+            "isPrefill": True,
+            "enableLcoc": True,
+            "enableSpeculate": False,
+            "skipWordEmbedding": False,
+            "enableSplitFuse": False,
         }
         decoder_param = {
-            **coder_param, "isPrefill": False, "supportLcoc": False,
-            "supportSpeculate": False
+            **coder_param,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
         }
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
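
Note: several coder_param keys are renamed in this release (rmsNormEps becomes normEps plus normType, the support* flags become enable* flags, and the backend now comes from config.communication_backend). The encoder and decoder dictionaries are built by unpacking the shared coder_param and overriding a few entries; a minimal sketch of that merge pattern with hypothetical values:

    # Later keys win when unpacking with **, so "isPrefill" differs per operation
    # while every shared entry from coder_param is inherited unchanged.
    coder_param = {"normEps": 1e-5, "normType": 0, "worldSize": 2}
    encoder_param = {**coder_param, "isPrefill": True, "enableLcoc": True}
    decoder_param = {**coder_param, "isPrefill": False, "enableLcoc": False}
    assert encoder_param["normEps"] == decoder_param["normEps"] == 1e-5
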
@@ -92,14 +110,15 @@ class LlamaBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen,
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
         self.acl_decoder_operation_inputs[1] = self.placeholder
-        self.acl_decoder_operation_inputs[2] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_decoder_operation_inputs[2] = position_ids
         self.acl_decoder_operation_inputs[3] = cos_embed
         self.acl_decoder_operation_inputs[4] = sin_embed
         self.acl_decoder_operation_inputs[5] = attention_mask
@@ -108,8 +127,6 @@ class LlamaBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[8] = self.placeholder
         self.acl_decoder_operation_inputs[9] = self.placeholder
         self.acl_decoder_operation_inputs[10] = self.placeholder
-        self.acl_decoder_operation_inputs[11] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[12] = self.cast(
-            lm_head_indices, mstype.int64)
+        self.acl_decoder_operation_inputs[11] = input_lengths
+        self.acl_decoder_operation_inputs[12] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param

mindspore/experimental/llm_boost/atb/qwen_boost.py
@@ -15,11 +15,14 @@
 """llm boost"""
 import json
 import mindspore.common.dtype as mstype
-from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase
+from mindspore.experimental.llm_boost.atb.boost_base import AtbBoostBase, NormType
 from mindspore._c_expression import LlmBoostBinder
 from mindspore.experimental.llm_boost.register import LlmBoostRegister, LlmBoostType
 
 
+CPP_QWEN_MODEL_CLASS_NAME = "qwen_QwenDecoderModel"
+
+
 @LlmBoostRegister.register(LlmBoostType.BUILDIN, "Qwen")
 class QwenBoost(AtbBoostBase):
     """QwenBoost class"""
@@ -30,9 +33,11 @@ class QwenBoost(AtbBoostBase):
         self.acl_encoder_operation_inputs = [None] * self.in_tensor_length
         self.acl_decoder_operation_inputs = [None] * self.in_tensor_length
         self.atb_encoder_operation = LlmBoostBinder(
-            "ATB", "qwen_DecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
         self.atb_decoder_operation = LlmBoostBinder(
-            "ATB", "qwen_DecoderModel")
+            self.backend_name, CPP_QWEN_MODEL_CLASS_NAME
+        )
 
     def init(self):
         """set param"""
@@ -42,24 +47,43 @@ class QwenBoost(AtbBoostBase):
             "withEmbedding": True,
             "isEmbeddingParallel": True,
             "isLmHeadParallel": True,
-            "linearTransposeType": [[1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)],
+            "linearTransposeType": [
+                [1, -1, -1, 1, 1, -1, 1] for i in range(self.num_layers)
+            ],
             "lmHeadTransposeType": 1,
-            "supportSwiGLU": not self.need_nz,
-            "rmsNormEps": self.config.rms_norm_eps,
+            "enableSwiGLU": not self.need_nz,
+            "normEps": self.config.rms_norm_eps,
+            "normType": NormType.RMS_NORM,
             "numAttentionHeadsPerRank": self.config.num_heads // self.device_num,
             "hiddenSizePerAttentionHead": self.head_dim,
             "numHiddenLayers": self.num_layers,
             "numKeyValueHeadsPerRank": self.n_kv_heads // self.device_num,
             "rank": self.rank_id,
             "worldSize": self.device_num,
-            "backend": "lccl",
+            "backend": self.config.communication_backend,
             "packQuantType": [[1, 1] for _ in range(self.num_layers)],
-            "linearQuantType": [[0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)],
-            "kvQuant": self.kv_quant is not None,
+            "linearQuantType": [
+                [0, -1, -1, 0, 0, -1, 0] for _ in range(self.num_layers)
+            ],
+            "linearHasBias": [[True, False, False, False]] * self.num_layers,
+            "enableKvQuant": self.kv_quant is not None,
+            "enableLora": False,
+            "isUnpadInputs": True,
+            "enableAddNorm": False,
+        }
+        encoder_param = {
+            **param_dict,
+            "isPrefill": True,
+            "enableLcoc": False,
+            "enableSplitFuse": False,
+        }
+        decoder_param = {
+            **param_dict,
+            "isPrefill": False,
+            "enableLcoc": False,
+            "enableSpeculate": False,
+            "enablePrefixCache": False,
         }
-        encoder_param = {**param_dict, "isPrefill": True, "supportLcoc": False}
-        decoder_param = {**param_dict, "isPrefill": False,
-                         "supportLcoc": False, "supportSpeculate": False}
         self.atb_encoder_operation.init(json.dumps({**encoder_param}))
         self.atb_decoder_operation.init(json.dumps({**decoder_param}))
 
@@ -79,13 +103,14 @@ class QwenBoost(AtbBoostBase):
         **kwargs
     ):
         """prepare inputs"""
-        self.acl_param = json.dumps({
-            "seqLen": seqLen,
-        })
-        self.acl_decoder_operation_inputs[0] = self.cast(
-            input_ids, mstype.int64)
-        self.acl_decoder_operation_inputs[1] = self.cast(
-            position_ids, mstype.int32)
+        self.acl_param = json.dumps(
+            {
+                "seqLen": seqLen,
+            }
+        )
+
+        self.acl_decoder_operation_inputs[0] = input_ids
+        self.acl_decoder_operation_inputs[1] = position_ids
         self.acl_decoder_operation_inputs[2] = cos_embed
         self.acl_decoder_operation_inputs[3] = sin_embed
         self.acl_decoder_operation_inputs[4] = attention_mask
@@ -93,9 +118,7 @@ class QwenBoost(AtbBoostBase):
         self.acl_decoder_operation_inputs[6] = slots
         self.acl_decoder_operation_inputs[7] = self.placeholder
         self.acl_decoder_operation_inputs[8] = self.placeholder
-        self.acl_decoder_operation_inputs[9] = self.cast(
-            input_lengths, mstype.int32)
-        self.acl_decoder_operation_inputs[10] = self.cast(
-            lm_head_indices, mstype.int64)
-        self.acl_decoder_operation_inputs[11] = self.placeholder
+        self.acl_decoder_operation_inputs[9] = self.placeholder
+        self.acl_decoder_operation_inputs[10] = input_lengths
+        self.acl_decoder_operation_inputs[11] = lm_head_indices
         return self.acl_decoder_operation_inputs, self.acl_param