ipex-llm 2.2.0b20250206__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250207__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
  optimize_model = False
  kwargs["modules_to_not_convert"] = ["lm_head"]

- load_in_8bit = kwargs.pop("load_in_8bit", False)
  from ipex_llm.llm_patching import bigdl_patched
  if bigdl_patched == 'Train':
  global patched_training_mode
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
  # ignore following arguments
  ignore_argument(kwargs, "model_hub")
  ignore_argument(kwargs, "load_in_4bit")
- ignore_argument(kwargs, "load_in_8bit")
  ignore_argument(kwargs, "imatrix")
  ignore_argument(kwargs, "cpu_embedding")
  ignore_argument(kwargs, "embedding_qtype")
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
  n_splits_linear: int = 1,
  n_splits_down_proj: int = 1,
  group_size: int = 0,
+ cos_len: int = 1,
+ keep_position_ids=True,
  asym: bool = False,
  ):
  super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
  self.dtype = dtype
  self.cached_cos = cached_cos
  self.cached_sin = cached_sin
+ self.cos_len = cos_len
  self.batch_size, self.seq_len, self.hidden_size = hidden_shape
  self.mode = mode
  self.rms_norm_eps = rms_norm_eps
  self.transpose_value = transpose_value
  self.num_layers = num_layers

- cos = self.constant(self.cached_cos)
- self.cos = self.unsqueeze(cos, axis=0)
-
- sin = self.constant(self.cached_sin)
- self.sin = self.unsqueeze(sin, axis=0)
-
  if mode == "decode":
  self.kv_seq_len = self.max_seq_len + 1
  else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
  attention_mask = self.create_input_op(
  (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)

- position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+ if self.cached_cos is None:
+ if mode == "prefill" and keep_position_ids:
+ position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+ cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+ dtype=np.float32)
+ self.cos = self.convert_to_fp16(cos)
+ sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+ dtype=np.float32)
+ self.sin = self.convert_to_fp16(sin)
+ else:
+ position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+ cos = self.constant(self.cached_cos)
+ self.cos = self.unsqueeze(cos, axis=0)
+ sin = self.constant(self.cached_sin)
+ self.sin = self.unsqueeze(sin, axis=0)

  if input_layernorm_weights is None:
  input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
  hidden_states = input

  curr_key_values = []
+ cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
  for i in range(num_layers):
  hidden_states, new_key_states, new_value_states = self.build_decoder(
  hidden_states=hidden_states,
  attention_mask=attention_mask,
- position_ids=position_ids,
+ position_ids=position_ids if cos_condition else None,
  input_layernorm_weight=input_layernorm_weights[i],
  post_attention_layernorm_weight=post_attn_layernorm_weights[i],
  q_bias=q_biases[i],
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
  self.compile()


+ class Llama32Embedding(NNFactory):
+ def __init__(
+ self,
+ vocab_size,
+ embedding_dim,
+ embedding_weight,
+ padding_idx,
+ inv_freq,
+ attention_scaling,
+ dtype, # fp16
+ device: str = "NPU",
+ ):
+ super().__init__(False, device)
+ self.vocab_size = vocab_size
+ self.embedding_dim = embedding_dim
+ self.padding_idx = padding_idx
+ self.attention_scaling = attention_scaling
+ self.dtype = dtype
+
+ # define input
+ weight = self.constant(embedding_weight)
+ input = self.parameter((1, 1), dtype=np.int32)
+ position_ids = self.parameter((1, 1), dtype=np.int64)
+ inv_freq = self.constant(inv_freq)
+
+ # embed_tokens module
+ if padding_idx == -1:
+ padding_idx += vocab_size
+
+ axis_node = self.constant(np.array([0], dtype=np.int64))
+ if padding_idx is not None:
+ masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+ masked_embeddings[padding_idx, :] = 0.0 # mask
+
+ node_mask = self.constant(masked_embeddings)
+ node_masked_w = self.eltwise_mul(weight, node_mask)
+ res = self.gather(node_masked_w, input, axis_node, 0)
+ else:
+ res = self.gather(weight, input, axis_node, 0)
+
+ # rotary_emb module
+ inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+ position_ids = self.reshape(position_ids, (1, 1, 1))
+ freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+ self.convert_to_fp32(position_ids))
+ freqs = self.transpose(freqs, [0, 2, 1])
+ emb = self.concat(freqs, freqs, axis=2)
+ cos = self.cos(emb)
+ sin = self.sin(emb)
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ # define outputs
+ res = self.convert_to_fp16(res)
+ cos = self.convert_to_fp32(cos)
+ sin = self.convert_to_fp32(sin)
+
+ print("start compiling")
+ self.compile()
+
+
+ class Llama32PostEmbedding(NNFactory):
+ def __init__(
+ self,
+ inv_freq,
+ attention_scaling,
+ input_len: int = 1,
+ device: str = "NPU",
+ ):
+ super().__init__(False, device)
+ self.attention_scaling = attention_scaling
+
+ # define input
+ position_ids = self.parameter((1, input_len), dtype=np.int64)
+ inv_freq = self.constant(inv_freq)
+
+ # rotary_emb module
+ inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+ position_ids = self.reshape(position_ids, (1, 1, input_len))
+ freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+ self.convert_to_fp32(position_ids))
+ freqs = self.transpose(freqs, [0, 2, 1])
+ emb = self.concat(freqs, freqs, axis=2)
+ cos = self.cos(emb)
+ sin = self.sin(emb)
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+ if input_len > 1:
+ cos = self.unsqueeze(cos, [1])
+ sin = self.unsqueeze(sin, [1])
+
+ # define outputs
+ cos = self.convert_to_fp32(cos)
+ sin = self.convert_to_fp32(sin)
+
+ print("start compiling")
+ self.compile()
+
+
  def obtain_weight_from_single_layer(attn_layer, mlp_layer):
  weights = []
  if hasattr(attn_layer, "q_proj_dq_list"):
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
  k_bias = attn_layer.k_proj.bias.to(torch.float16)
  v_bias = attn_layer.v_proj.bias.to(torch.float16)
  return q_bias, k_bias, v_bias
+
+
+ def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+ max_prompt_len, keep_ir, compile_blob):
+ if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+ # llama-2-7B & llama-3-8B
+ embedding_layer = model.model.embed_tokens
+ new_embedding = LLMEmbedding(
+ vocab_size=model.config.vocab_size,
+ embedding_dim=model.config.hidden_size,
+ embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+ padding_idx=model.config.pad_token_id,
+ dtype=np.float16,
+ )
+ if convert_model:
+ bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+ embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+ first_blob_path = None
+ else:
+ first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+ temp_dir, keep_ir=keep_ir,
+ compile_blob=compile_blob)
+ os.remove(os.path.join(temp_dir, "embedding.bin"))
+ else:
+ # llama-3.2-3B & llama-3.2-1B
+ # for transformers >= 4.45.0
+ embedding_layer = model.model.embed_tokens
+ new_embedding = Llama32Embedding(
+ vocab_size=model.config.vocab_size,
+ embedding_dim=model.config.hidden_size,
+ embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+ padding_idx=model.config.pad_token_id,
+ inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+ attention_scaling=model.model.rotary_emb.attention_scaling,
+ dtype=np.float16,
+ )
+ if convert_model:
+ bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+ embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+ first_blob_path = None
+ # save embedding post module
+ inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+ attention_scaling = model.model.rotary_emb.attention_scaling
+ embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+ attention_scaling=attention_scaling,
+ input_len=1)
+ update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+ temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+ embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+ attention_scaling=attention_scaling,
+ input_len=max_prompt_len)
+ update_names_of_IR_and_export_blob(embedding_post_prefill,
+ "embedding_post_prefill",
+ temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+ os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+ os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+ else:
+ first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+ temp_dir, keep_ir=keep_ir,
+ compile_blob=compile_blob)
+ os.remove(os.path.join(temp_dir, "embedding.bin"))
+ return first_blob_path
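Note: the helper added above is what the llama and qwen converters later in this diff switch to for their embedding export. A hedged usage sketch follows; the surrounding variables are assumed to come from a convert_lm_head_and_embedding call, and the absolute import path is inferred from the RECORD entry for common.py rather than quoted from the package:

from ipex_llm.transformers.npu_pipeline_model.common import obtain_embedding_from_model

# Returns the exported embedding blob path, or None when convert_model=True and
# only the raw weights (model_embedding_input_0.bin) are written into weight_dir.
first_blob_path = obtain_embedding_from_model(model, convert_model,
                                              temp_dir, weight_dir,
                                              max_prompt_len,
                                              keep_ir, compile_blob)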
@@ -31,6 +31,7 @@ import tempfile
  import numpy as np
  from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
  from multiprocessing import Pool
+ import transformers


  def generate(
@@ -456,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
  custom_object_save(model, save_directory, config=model.config)

  if model.config.model_type == "qwen2":
+ cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+ embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
  if group_size == 0:
  if model.config.hidden_size == 1536:
  # Qwen2-1.5B-Instruct
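For reference, both new flags key on the rotary-embedding attribute that newer transformers releases dropped. A minimal, self-contained sketch of the same check; the helper name is illustrative, not from the package:

def uses_cached_rotary(model) -> bool:
    # transformers < 4.45.0 keeps precomputed cos_cached / sin_cached tensors on each
    # layer's rotary_emb; newer versions compute cos/sin at runtime instead.
    rotary_emb = model.model.layers[0].self_attn.rotary_emb
    return hasattr(rotary_emb, "cos_cached")

# The qwen2 branch above derives both deploy flags as the inverse of this check:
# cos_sin_input = embedding_post = not uses_cached_rotary(model)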
@@ -476,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
  "use_prefill_sdp": False,
  "weight_num": 7,
  "weight_idx": 8,
+ "embedding_post": embedding_post,
+ "cos_sin_input": cos_sin_input,
  "n_splits_linear": n_splits_linear,
  "n_splits_down_proj": n_splits_down_proj,
  "lm_head_low_bit": lm_head_low_bit}
@@ -493,8 +498,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
  group_size, layernorm_const, "prefill",
  keep_ir=keep_ir, compile_blob=compile_blob)
  # save blob of lmhead and bin of embedding
- convert_lm_head_and_embedding(model, save_directory, weight_dir,
- convert_model=True, group_size=group_size,
+ convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+ group_size=group_size, max_prompt_len=max_prompt_len,
  keep_ir=keep_ir, compile_blob=compile_blob)
  elif model.config.model_type == "llama":
  embedding_post = False
@@ -18,108 +18,8 @@
  import torch
  import numpy as np
  import os
- from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
- obtain_weight_from_single_layer
- from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
- class Llama32Embedding(NNFactory):
- def __init__(
- self,
- vocab_size,
- embedding_dim,
- embedding_weight,
- padding_idx,
- inv_freq,
- attention_scaling,
- dtype, # fp16
- device: str = "NPU",
- ):
- super().__init__(False, device)
- self.vocab_size = vocab_size
- self.embedding_dim = embedding_dim
- self.padding_idx = padding_idx
- self.attention_scaling = attention_scaling
- self.dtype = dtype
-
- # define input
- weight = self.constant(embedding_weight)
- input = self.parameter((1, 1), dtype=np.int32)
- position_ids = self.parameter((1, 1), dtype=np.int64)
- inv_freq = self.constant(inv_freq)
-
- # embed_tokens module
- if padding_idx == -1:
- padding_idx += vocab_size
-
- axis_node = self.constant(np.array([0], dtype=np.int64))
- if padding_idx is not None:
- masked_embeddings = np.ones(weight.shape, dtype=np.float16)
- masked_embeddings[padding_idx, :] = 0.0 # mask
-
- node_mask = self.constant(masked_embeddings)
- node_masked_w = self.eltwise_mul(weight, node_mask)
- res = self.gather(node_masked_w, input, axis_node, 0)
- else:
- res = self.gather(weight, input, axis_node, 0)
-
- # rotary_emb module
- inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
- position_ids = self.reshape(position_ids, (1, 1, 1))
- freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
- self.convert_to_fp32(position_ids))
- freqs = self.transpose(freqs, [0, 2, 1])
- emb = self.concat(freqs, freqs, axis=2)
- cos = self.cos(emb)
- sin = self.sin(emb)
- cos = cos * self.attention_scaling
- sin = sin * self.attention_scaling
-
- # define outputs
- res = self.convert_to_fp16(res)
- cos = self.convert_to_fp32(cos)
- sin = self.convert_to_fp32(sin)
-
- print("start compiling")
- self.compile()
-
-
- class Llama32PostEmbedding(NNFactory):
- def __init__(
- self,
- inv_freq,
- attention_scaling,
- input_len: int = 1,
- device: str = "NPU",
- ):
- super().__init__(False, device)
- self.attention_scaling = attention_scaling
-
- # define input
- position_ids = self.parameter((1, input_len), dtype=np.int64)
- inv_freq = self.constant(inv_freq)
-
- # rotary_emb module
- inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
- position_ids = self.reshape(position_ids, (1, 1, input_len))
- freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
- self.convert_to_fp32(position_ids))
- freqs = self.transpose(freqs, [0, 2, 1])
- emb = self.concat(freqs, freqs, axis=2)
- cos = self.cos(emb)
- sin = self.sin(emb)
- cos = cos * self.attention_scaling
- sin = sin * self.attention_scaling
- if input_len > 1:
- cos = self.unsqueeze(cos, [1])
- sin = self.unsqueeze(sin, [1])
-
- # define outputs
- cos = self.convert_to_fp32(cos)
- sin = self.convert_to_fp32(sin)
-
- print("start compiling")
- self.compile()
+ from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+ obtain_weight_from_single_layer, obtain_embedding_from_model


  def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
@@ -197,62 +97,10 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
  bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
  weight.tofile(bin_file)

- if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
- # llama-2-7B & llama-3-8B
- embedding_layer = model.model.embed_tokens
- new_embedding = LLMEmbedding(
- vocab_size=model.config.vocab_size,
- embedding_dim=model.config.hidden_size,
- embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
- padding_idx=model.config.pad_token_id,
- dtype=np.float16,
- )
- if convert_model:
- bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
- embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
- first_blob_path = None
- else:
- first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
- temp_dir, keep_ir=keep_ir,
- compile_blob=compile_blob)
- os.remove(os.path.join(temp_dir, "embedding.bin"))
- else:
- # llama-3.2-3B & llama-3.2-1B
- embedding_layer = model.model.embed_tokens
- new_embedding = Llama32Embedding(
- vocab_size=model.config.vocab_size,
- embedding_dim=model.config.hidden_size,
- embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
- padding_idx=model.config.pad_token_id,
- inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
- attention_scaling=model.model.rotary_emb.attention_scaling,
- dtype=np.float16,
- )
- if convert_model:
- bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
- embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
- first_blob_path = None
- # save embedding post module
- inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
- attention_scaling = model.model.rotary_emb.attention_scaling
- embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
- attention_scaling=attention_scaling,
- input_len=1)
- update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
- temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
- embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
- attention_scaling=attention_scaling,
- input_len=max_prompt_len)
- update_names_of_IR_and_export_blob(embedding_post_prefill,
- "embedding_post_prefill",
- temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
- os.remove(os.path.join(temp_dir, "embedding_post.bin"))
- os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
- else:
- first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
- temp_dir, keep_ir=keep_ir,
- compile_blob=compile_blob)
- os.remove(os.path.join(temp_dir, "embedding.bin"))
+ first_blob_path = obtain_embedding_from_model(model, convert_model,
+ temp_dir, weight_dir,
+ max_prompt_len,
+ keep_ir, compile_blob)

  return first_blob_path, last_blob_path

@@ -18,13 +18,14 @@
  import torch
  import numpy as np
  import os
- from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
- obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+ from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+ obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+ obtain_embedding_from_model
  from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead


  def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
- convert_model=False, group_size=0,
+ convert_model=False, group_size=0, max_prompt_len=1,
  keep_ir=False, compile_blob=True):
  num_heads = model.model.layers[0].self_attn.num_heads
  head_dim = model.model.layers[0].self_attn.head_dim
@@ -107,24 +108,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
  bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
  weight.tofile(bin_file)

- embedding_layer = model.model.embed_tokens
- new_embedding = LLMEmbedding(
- vocab_size=model.config.vocab_size,
- embedding_dim=model.config.hidden_size,
- embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
- padding_idx=model.config.pad_token_id,
- dtype=np.float16,
- input_length=1,
- )
- if convert_model:
- bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
- embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
- first_blob_path = True
- else:
- first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
- temp_dir, keep_ir=keep_ir,
- compile_blob=compile_blob)
- os.remove(os.path.join(temp_dir, "embedding.bin"))
+ first_blob_path = obtain_embedding_from_model(model, convert_model,
+ temp_dir, weight_dir,
+ max_prompt_len,
+ keep_ir, compile_blob)
  return first_blob_path, last_blob_path


@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
  mlp_layer = curr_layer.mlp
  weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
  q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
- cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
- cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+ if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+ cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+ cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+ else:
+ # transformers >= 4.45.0
+ cached_cos = None
+ cached_sin = None
  layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
  layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)

@@ -158,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
  if mode == "decode":
  input_len = 1
  decoder_name = f"decoder_layer_{layer_idx}"
+ keep_position_ids = True
  npu_dpu_groups = None
  else:
  input_len = kv_len
  decoder_name = "decoder_layer_prefill"
+ keep_position_ids = False
  npu_dpu_groups = 6

  single_decoder = LowBitQwenMultiDecoderlayer(
@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
  n_splits_linear=n_splits_linear,
  n_splits_down_proj=n_splits_down_proj,
  group_size=group_size,
+ cos_len=input_len,
+ keep_position_ids=keep_position_ids,
  asym=asym
  )
  rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
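Taken together, the two decoder hunks above wire the new constructor arguments to the graph mode. A hedged recap, assembled from the hunks rather than quoted verbatim:

if mode == "decode":
    input_len = 1                 # one new token per step
    keep_position_ids = True
else:  # "prefill"
    input_len = kv_len            # whole prompt in one pass
    keep_position_ids = False     # prefill graph built without a position_ids input
                                  # when cos/sin are passed in at runtime

single_decoder = LowBitQwenMultiDecoderlayer(
    # ... existing arguments unchanged ...
    cos_len=input_len,            # length of the cos/sin inputs when cos_cached is absent
    keep_position_ids=keep_position_ids,
)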
@@ -196,14 +192,25 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

  # 0, 1, 2 are input_embed/attention_mask/position_id
  if mode == "decode":
- if layernorm_const:
- st_idx = 3
+ if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+ if layernorm_const:
+ st_idx = 3
+ else:
+ input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+ post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+ layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+ layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+ st_idx = 5
  else:
- input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
- post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
- layer_norm_0.data.numpy().tofile(input_lm_bin_file)
- layer_norm_1.data.numpy().tofile(post_lm_bin_file)
- st_idx = 5
+ # transformers >= 4.45.0
+ if layernorm_const:
+ st_idx = 4
+ else:
+ input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+ post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+ layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+ layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+ st_idx = 6
  q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
  k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
  v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
  attn_layer = curr_layer.self_attn
  mlp_layer = curr_layer.mlp
  weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
- cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
- cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+ if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+ cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+ cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+ else:
+ # transformers >= 4.45.0
+ cached_cos = None
+ cached_sin = None
  layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
  layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250206
+ Version: 2.2.0b20250207
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250207 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
- Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp-arl'
- Requires-Dist: setuptools ; extra == 'cpp-arl'
- Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
- Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
- Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
- Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
- Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+ Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
  Provides-Extra: llama-index
  Requires-Dist: py-cpuinfo ; extra == 'llama-index'
  Requires-Dist: protobuf ; extra == 'llama-index'
@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250206 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250207 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250206 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250207 ; extra == 'xpu-2-6'
  Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
  Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
  Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
@@ -102,9 +102,9 @@ ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s
  ipex_llm/transformers/loader.py,sha256=AwjV5RpI2t2bedlv7ZhLm8cfd-QJZm5hny-XyjIvdnk,6876
  ipex_llm/transformers/lookup.py,sha256=b6OlZ9OV10R9qeWw8mVryVpDxszkjwLkldvi7GPMJY8,19614
  ipex_llm/transformers/low_bit_linear.py,sha256=3EtbiCAq5HU_r2pGJ9beSDK4NPTN8Jj-aHMqm1jqX18,39177
- ipex_llm/transformers/model.py,sha256=cQJNlAkdfoWmVbWd-TS2hf-Do41mMO9orPvG3FO4Nns,40855
+ ipex_llm/transformers/model.py,sha256=FyHrEQhkHxG3FbGkhTjVOP2rgFMjc3AXcjDwvvB0HqU,40798
  ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
- ipex_llm/transformers/npu_model.py,sha256=LMmRmhq8IAN9FrXLUeUK2B8XS2OJ5GVWmG0cEdeK-ro,40354
+ ipex_llm/transformers/npu_model.py,sha256=zgXOiLIJ-3p-1Kejgv4jUFK8OiBZbezMZrRyn0_6_8c,40306
  ipex_llm/transformers/patches.py,sha256=G9KcXxo42H1HJEDaroq4JbBN5P0P0lty7U7kk7-g4tw,991
  ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
  ipex_llm/transformers/qlora.py,sha256=qV9Y6G5kAaet77LLA3oXn3qQY4ayyAPZ7NAjOlHCS7g,14967
@@ -209,17 +209,17 @@ ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ
  ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
  ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
  ipex_llm/transformers/npu_models/qwen2.py,sha256=RDNtPK8kxMk3z8A4S53saTrw2klgkzo4oa7voJLwr1o,12085
- ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=dnxpkLVW2bUsL4V-kZTyT2itc5aOpIB5vP3U7FtWdrs,44184
+ ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=EKiI80rnQ43WUF_2wWCy75mx-rbjAbRQSB49OgjZFNo,45003
  ipex_llm/transformers/npu_models/stablelm.py,sha256=0iUhdjFqFd0svuTd09wP60mbEtobPkNSj-1I1vfuhsU,7778
  ipex_llm/transformers/npu_models/xlm_mp.py,sha256=sj8OVun8xJprM7ZJp0XzWa55rqlSIzNMdKmI9i6jlDU,28332
  ipex_llm/transformers/npu_pipeline_model/__init__.py,sha256=b2IXvVqQ5cItki021h8s3ymW12RPu8QNPprq4Mn3bDM,586
  ipex_llm/transformers/npu_pipeline_model/baichuan.py,sha256=ICxRzFQ4OIANDkkVi2_4xOeQXmfFXYMx3H52KuE1xR4,6208
- ipex_llm/transformers/npu_pipeline_model/common.py,sha256=QxJoJESpv0BpwO_FBeAT2wKA56wNFfen8iI37PrMKuA,7838
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=-eHNbRuX2QhYd0-jCyo2pZpHTZTZ108bhObYx8a3CJs,29494
- ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=pmAnawfAn0W8XSr8kGWfxR1HylCLa-Y6mKpFeX-m8UY,20892
+ ipex_llm/transformers/npu_pipeline_model/common.py,sha256=faooJmM75qnVyZYuQLx9gJpVlotcVF4qXRCnOrknfk4,14776
+ ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=_l4RFmyBMbREo8vzKpHXAMtE202JVQ41Y2lPg1qCOMI,29846
+ ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=j2sipfFSrzV2VgLKPOClMHwWIDXqDsL1jIQJK25hneo,14397
  ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=H7j_UaHj-IwEBriQ-bunle0-8s2NmvqnL9eYuixnmFc,21398
  ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
- ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=FAfoPlKEAxeU6-J8ltpSev5ithm9AC-urtreu6NGpME,15509
+ ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=6MNtCL1CXoR19B4tKZSgv2e5gtma9bqDG7DOYMCnPt0,16013
  ipex_llm/utils/__init__.py,sha256=LlUgrD03rfw4iY8zWPtHH6p65Gw76waVOLHaqagETw0,1425
  ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
  ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -255,11 +255,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
  ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
  ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
  ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
- ipex_llm-2.2.0b20250206.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
- ipex_llm-2.2.0b20250206.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
- ipex_llm-2.2.0b20250206.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
- ipex_llm-2.2.0b20250206.dist-info/METADATA,sha256=pAr_-dBEJB_J2lV8oNgJkJ5bGTObiseNHISkXAGkY9I,12879
- ipex_llm-2.2.0b20250206.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
- ipex_llm-2.2.0b20250206.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
- ipex_llm-2.2.0b20250206.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
- ipex_llm-2.2.0b20250206.dist-info/RECORD,,
+ ipex_llm-2.2.0b20250207.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ ipex_llm-2.2.0b20250207.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ ipex_llm-2.2.0b20250207.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ ipex_llm-2.2.0b20250207.dist-info/METADATA,sha256=d1hx5hE5Xeb3lHGWqeF35SK9GZOX6syXJ_Syu5b35IU,12369
+ ipex_llm-2.2.0b20250207.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ ipex_llm-2.2.0b20250207.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ ipex_llm-2.2.0b20250207.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ ipex_llm-2.2.0b20250207.dist-info/RECORD,,