ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (44)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/model.py +0 -1
  31. ipex_llm/transformers/npu_model.py +0 -1
  32. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  33. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  34. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +7 -2
  35. ipex_llm/transformers/npu_pipeline_model/llama.py +6 -158
  36. ipex_llm/transformers/npu_pipeline_model/qwen.py +44 -32
  37. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
  38. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +44 -44
  39. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
  40. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
  41. {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
  43. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
  44. {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/libs/*.dll and *.exe (items 1-29 above) CHANGED
Binary files changed; no textual diff is shown.
ipex_llm/transformers/model.py CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-            load_in_8bit = kwargs.pop("load_in_8bit", False)
             from ipex_llm.llm_patching import bigdl_patched
             if bigdl_patched == 'Train':
                 global patched_training_mode
ipex_llm/transformers/npu_model.py CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
ipex_llm/transformers/npu_models/qwen2_mp.py CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers

-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)

-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)

         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input

         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
             hidden_states, new_key_states, new_value_states = self.build_decoder(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                 input_layernorm_weight=input_layernorm_weights[i],
                 post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                 q_bias=q_biases[i],
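
Background for the qwen2_mp.py change above: with transformers >= 4.45.0 the per-layer rotary embedding no longer exposes cos_cached/sin_cached, so when cached_cos is None the decoder graph now declares cos and sin as runtime float32 inputs of shape (batch_size, cos_len, head_dim) and converts them to fp16, instead of baking them in as constants. A minimal host-side sketch of producing such inputs with standard RoPE math (head_dim, the position value and the 10000 base are illustrative assumptions, not values read from this package):

    import numpy as np

    batch_size, cos_len, head_dim = 1, 1, 128                 # decode step: cos_len == 1
    position_ids = np.array([[128]], dtype=np.int64)          # position of the current token
    inv_freq = 1.0 / (10000.0 ** (np.arange(0, head_dim, 2) / head_dim))  # assumed RoPE base
    freqs = position_ids[..., None].astype(np.float32) * inv_freq[None, None, :]
    emb = np.concatenate([freqs, freqs], axis=-1)             # (batch_size, cos_len, head_dim)
    cos_input = np.cos(emb).astype(np.float32)                # fed to the new fp32 cos input op
    sin_input = np.sin(emb).astype(np.float32)                # fed to the new fp32 sin input op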
ipex_llm/transformers/npu_pipeline_model/common.py CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()
 
 
+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
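
The embed_tokens part of the Llama32Embedding graph above zeroes the padding row of the weight matrix before gathering, the graph-level equivalent of an embedding layer with padding_idx. A plain NumPy sketch of that masked gather (vocab/hidden sizes and token ids are illustrative assumptions):

    import numpy as np

    vocab_size, hidden_size, padding_idx = 32000, 2048, 0     # assumed sizes
    weight = np.random.rand(vocab_size, hidden_size).astype(np.float16)
    mask = np.ones_like(weight)
    mask[padding_idx, :] = 0.0                                 # zero out the padding row
    token_ids = np.array([[1, 5, 9]], dtype=np.int32)
    embeddings = (weight * mask)[token_ids]                    # gather along axis 0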
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path
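
Both the Llama and Qwen pipelines below now call this single helper instead of carrying their own copies of the embedding-export logic. A hedged usage sketch (argument names come from the definition above; the surrounding directories and prompt length are placeholders):

    # Inside convert_lm_head_and_embedding (llama.py / qwen.py), roughly:
    first_blob_path = obtain_embedding_from_model(
        model, convert_model,          # convert_model=True writes .bin weights and returns None
        temp_dir, weight_dir,
        max_prompt_len,                # sizes the "embedding_post_prefill" blob
        keep_ir, compile_blob,
    )
    # rotary_emb.cos_cached present (transformers < 4.45): embedding weight/blob only.
    # cos_cached absent (transformers >= 4.45): "embedding_post" and "embedding_post_prefill"
    # blobs are exported as well when convert_model is True.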
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers
 
 
 def generate(
@@ -456,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)
 
     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -476,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -493,8 +498,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                           group_size, layernorm_const, "prefill",
                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size,
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
                                       keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
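
The branches above are driven by a capability probe on the loaded model rather than a version compare; shown standalone (sketch, with model being any loaded Qwen2/Llama checkpoint):

    rotary_emb = model.model.layers[0].self_attn.rotary_emb
    legacy_rope = hasattr(rotary_emb, "cos_cached")   # True for transformers < 4.45.0
    cos_sin_input = not legacy_rope    # rotary cos/sin become runtime graph inputs
    embedding_post = not legacy_rope   # a separate post-embedding blob is exported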
ipex_llm/transformers/npu_pipeline_model/llama.py CHANGED
@@ -18,108 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
@@ -197,62 +97,10 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
-        # llama-2-7B & llama-3-8B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = LLMEmbedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
-            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
 
     return first_blob_path, last_blob_path
 
ipex_llm/transformers/npu_pipeline_model/qwen.py CHANGED
@@ -18,13 +18,14 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0,
+                                  convert_model=False, group_size=0, max_prompt_len=1,
                                   keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -107,24 +108,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    embedding_layer = model.model.embed_tokens
-    new_embedding = LLMEmbedding(
-        vocab_size=model.config.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, keep_ir=keep_ir,
-                                                             compile_blob=compile_blob)
-        os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -158,10 +150,12 @@
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
        npu_dpu_groups = None
     else:
        input_len = kv_len
        decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
        npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(
@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
@@ -196,14 +192,25 @@
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if layernorm_const:
-            st_idx = 3
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if layernorm_const:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-            st_idx = 5
+            # transformers >= 4.45.0
+            if layernorm_const:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
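The shifted weight-file indices in the decode branch above can be read as a small lookup; a sketch of the mapping that branch encodes (the helper name is hypothetical, for illustration only):

    def qkv_bias_start_index(has_cached_cos: bool, layernorm_const: bool) -> int:
        # Starting index of the q/k/v bias weight files for a decode-mode layer.
        if has_cached_cos:                  # transformers < 4.45.0
            return 3 if layernorm_const else 5
        return 4 if layernorm_const else 6  # transformers >= 4.45.0

    assert qkv_bias_start_index(True, True) == 3     # q bias saved as model_{i}_input_3.bin
    assert qkv_bias_start_index(False, False) == 6   # layernorms saved as input_4/input_5 first
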
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250206
+Version: 2.2.0b20250207
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250207 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
-Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp-arl'
-Requires-Dist: setuptools ; extra == 'cpp-arl'
-Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
 Provides-Extra: llama-index
 Requires-Dist: py-cpuinfo ; extra == 'llama-index'
 Requires-Dist: protobuf ; extra == 'llama-index'
@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250206 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250207 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.0b20250206 ; extra == 'xpu-2-6'
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250207 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD CHANGED
@@ -41,35 +41,35 @@ ipex_llm/langchain/llms/transformerspipelinellm.py,sha256=vm522YPPwWxxAPVvQBtxRf
 ipex_llm/langchain/vllm/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/langchain/vllm/vllm.py,sha256=6dxc-ZISZQrJilEa_HA827l75Dv9rcHpY_G6FdJ8BVs,7793
 ipex_llm/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ipex_llm/libs/bloom-api.dll,sha256=H0S3QMH9mK_VlsEGqqM7vGKNiuvD1j3_cNOloDEqojg,36352
-ipex_llm/libs/bloom.dll,sha256=GtBKdhbPz4gZDtzdcrjiIa0IoZighOQmWoaScXlCzGA,507904
-ipex_llm/libs/gptneox-api.dll,sha256=_8nji5kq5Z524SGHaElsEFZCkCZJRyjLAbK7dF5EAkE,24576
-ipex_llm/libs/gptneox.dll,sha256=su29UwirxxACBTb9rKx4ln5sKsmmG82J7wbIFv9GOQs,568320
-ipex_llm/libs/libbloom_avx.dll,sha256=v1lgo7B-JJMWEwgs5hDwkm9XSd0nmO1r1X8JoYaJLIs,536576
-ipex_llm/libs/libbloom_vnni.dll,sha256=dBbTV7wWKZKPMw4oZL-H2_ooLdBhwziXLI97xLKvC3w,508416
-ipex_llm/libs/libgptneox_avx.dll,sha256=F_JBremk85c6zqKo0_rse9YXo9v_T52jFwy8Nnwt9yg,596992
-ipex_llm/libs/libgptneox_vnni.dll,sha256=8A6hc2w5Xqq2MoY_t-El6upUqFuI5Cu-ITiiDv9Nfvg,568832
-ipex_llm/libs/libllama_avx.dll,sha256=fADeqa8IK5akM04Cjyd1IRY3Exk8tAuIdNzKBew2zJg,591360
-ipex_llm/libs/libllama_vnni.dll,sha256=SbwkJLCQqtIW9zz_QKzAYb5kqfyUSs8-gddMikbB57s,563200
-ipex_llm/libs/libstarcoder_avx.dll,sha256=vgvvBkIZ18ofJ9rE69gkNn9SpY025RyI7x2VM0APDWA,627712
-ipex_llm/libs/libstarcoder_vnni.dll,sha256=L0cdtY2qHvKpJhFEPl_UkaCVhUw4tcknoIuWbyxQ-ck,599552
-ipex_llm/libs/llama-api.dll,sha256=7yQHdnnFcNiHESH3nrGLyEWscKV9FTPWmDqk-Gf9bA8,25600
-ipex_llm/libs/llama.dll,sha256=pOUGsXP8_NP1byv7z_Q-JU2flWnTjYlCL6lbU-RvORw,562688
-ipex_llm/libs/main-bloom.exe,sha256=bK5DfBLbt4jHwdPl0hw1zaBGQHFWC9MFjiDRqCXFgFA,103424
-ipex_llm/libs/main-gptneox.exe,sha256=3OfGBYDzOpYeB6GxToauh8af4M8i6l4Z6ffYQPdKyIw,98816
-ipex_llm/libs/main-llama.exe,sha256=wZGa8lG3bfaEQi8-DvRC4D3sjMKXms1pwT9OXVME4_Y,99840
-ipex_llm/libs/main-starcoder.exe,sha256=3yZrYUpJ1FYOWCh6PNmWagQ5e6BmimlL25B6AiPmQys,157696
-ipex_llm/libs/pipeline.dll,sha256=uDPNVk7J_dvOX_NTAJs6AEtm5pAnwYLuczHYuTV6Pso,72704
-ipex_llm/libs/quantize-bloom.exe,sha256=6Rl2TEE9-FN0jHrcAYsZjfp0kAxzMoHKuvM31d8pzPs,126464
-ipex_llm/libs/quantize-bloom_vnni.exe,sha256=7Q20DE84l-CDxcVgUxzWspAh0faioQw2iJqdtk9JME4,128000
-ipex_llm/libs/quantize-gptneox.exe,sha256=QRxEqJYH3ShD6KLhW3guxM_SxPusFADvv8j5euhp53Q,104448
-ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=fj2E8ChFakQQzHHpYb_UxNy-9yQ9ZbChhr5PYUCdWkw,104960
-ipex_llm/libs/quantize-llama.exe,sha256=v2Rq663-92bMV3ze9l2-ocxvSjTeqlJegfY5XLf4MRQ,110080
-ipex_llm/libs/quantize-llama_vnni.exe,sha256=ywCgkuUA_jBImNslFpLFdcUvGv2pcbRvRZyZBhJ6-4c,110592
-ipex_llm/libs/quantize-starcoder.exe,sha256=zbiQpagpWGSYqgyHEmHgglDen3nDUS1LyhUXJbt65wE,127488
-ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=xtynT6qbnZ1nBRxsIQbi2JGSKOlvdSCCozDQJiDSwCg,128512
-ipex_llm/libs/starcoder-api.dll,sha256=7c7MWBv57ZhfiynSrYJIKhnE6HMXUTSYYTUGwD7BX9I,21504
-ipex_llm/libs/starcoder.dll,sha256=AzASEAh2HCDC9XIQ0JfUiUDqF-3p4KR3rF71MKQDA4k,599040
+ipex_llm/libs/bloom-api.dll,sha256=R0zcv1M0D8y8inrrCUO2xCSTRb0IChVyLa6YQo9zne8,36352
+ipex_llm/libs/bloom.dll,sha256=eBzUhLMeOAb9InMPp9_KC5VhJC9F-YKNlJn6HyfOAb0,507904
+ipex_llm/libs/gptneox-api.dll,sha256=9_mq8IntnMiU7-_kDxiLojnEc1nu3rrxZZAIes7Nd4k,24576
+ipex_llm/libs/gptneox.dll,sha256=kR3dyhN7tNUxVIWoqudW57V0MIGqr-Mxkmw7kwR8VWs,568320
+ipex_llm/libs/libbloom_avx.dll,sha256=0iRHd_QIzEG_NI0RkFKmCX_HG-3E21t33sxrmbCpQwo,536576
+ipex_llm/libs/libbloom_vnni.dll,sha256=dL1TzKzoki8KDsCmka6QfzBH24T06WokxT3F4M5a3lk,508416
+ipex_llm/libs/libgptneox_avx.dll,sha256=SPi9xXxB5jLp63CfgVhmMA-rCoyCCji2nuWz-rv5y3E,596992
+ipex_llm/libs/libgptneox_vnni.dll,sha256=NV3xykgHJGxNTDWAA_yhwlBG_dbHPX0__5s9uHCPmfc,568832
+ipex_llm/libs/libllama_avx.dll,sha256=EbZ-lpHHtM-zS9aiuDU8cBVueVAtRi3UqerARH41qC8,591360
+ipex_llm/libs/libllama_vnni.dll,sha256=67XqNSyXI1nuaA1-xcSOhYIHZaH7aZBvwMetGpTriIk,563200
+ipex_llm/libs/libstarcoder_avx.dll,sha256=kAqXHfoZfmyqIbNbGpzQjXNCMz9pkG5KVRECzEDEwhM,627712
+ipex_llm/libs/libstarcoder_vnni.dll,sha256=c02B9jpBvST282jRXJtkRwJKkZnzhkz5MLdFfjH9T8I,599552
+ipex_llm/libs/llama-api.dll,sha256=SA2frHXocsnAN9z3LZfWT_FjY1waSMS26bHM6ot_07c,25600
+ipex_llm/libs/llama.dll,sha256=Ls7CKimo2SNy-uJt6lLz16yz1O9E358dRgP8E0svF98,562688
+ipex_llm/libs/main-bloom.exe,sha256=-HCik31DRGrozp_Uy420O1l-Sk_7e9V1bjg4XaLPFvA,103424
+ipex_llm/libs/main-gptneox.exe,sha256=pqxQCGKBrsoDtvuKhCwk6uOAGt4GGvzoAdQbHB9qrFI,98816
+ipex_llm/libs/main-llama.exe,sha256=sPKj3WRmI97jyNhO4A5Lz4eF-tsZZojv6z2VaNzAKAU,99840
+ipex_llm/libs/main-starcoder.exe,sha256=7vyW8v2qO1J_fkRq4uzk44UsV4AhDGmcWHUwMiez8WY,157696
+ipex_llm/libs/pipeline.dll,sha256=vHFtLO6vUZQVwtzXICv1Q5Ork32Dw5Ipqa8pbr6TtmM,72704
+ipex_llm/libs/quantize-bloom.exe,sha256=8rUxXU7Z4AZ7mFHI3sGpwGG18_DkapunwTzzUTjCCbo,126464
+ipex_llm/libs/quantize-bloom_vnni.exe,sha256=gA9kKUkmFOIzT_CmFFvG-fG6d6bZuEWSTeyPvhCsDLs,128000
+ipex_llm/libs/quantize-gptneox.exe,sha256=YsrviyLjQU9uxD1p6TfdBAPXG72-QzZFGpt7lDmK_gM,104448
+ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=mYmUHza3rZztjTogXv9FxuIM20z0gHfyjbF6b6ADEK0,104960
+ipex_llm/libs/quantize-llama.exe,sha256=h-7nbo0uIswViTdxf_vHmE3sZdnQ79dDMUHzqjtyMKs,110080
+ipex_llm/libs/quantize-llama_vnni.exe,sha256=OEPzGySIaa-O9IhPY-u2slHnhMDzp6mL8e_Qr2WUgKc,110592
+ipex_llm/libs/quantize-starcoder.exe,sha256=4U-jT0MC4Iz4kP_6WpKkMOSk_hTlqAwgSVlGLGa-imA,127488
+ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=Xc4jW9KH_RNSfJIYJinDRIx-BbWmqxx4h-kc9jowZpk,128512
+ipex_llm/libs/starcoder-api.dll,sha256=2lF73SE1AyICwtpQSZUfkiAbE1WJQ5gEbikL1Lsvzhg,21504
+ipex_llm/libs/starcoder.dll,sha256=NBh51OQS90ppaqMAJAFCa6HptcUnnPx7tUL1J95QwMk,599040
 ipex_llm/llamaindex/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/llamaindex/llms/__init__.py,sha256=KP1lEdGqDuxPoxL1ZSH25Pm2kKMPJBWUTLR0ckSLMIU,1139
 ipex_llm/llamaindex/llms/bigdlllm.py,sha256=FQBzq1KOjfc6uofTXAha3O7TqpJkNfOFepXQmOVlbnI,26314
@@ -95,9 +95,9 @@ ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s
 ipex_llm/transformers/loader.py,sha256=AwjV5RpI2t2bedlv7ZhLm8cfd-QJZm5hny-XyjIvdnk,6876
 ipex_llm/transformers/lookup.py,sha256=b6OlZ9OV10R9qeWw8mVryVpDxszkjwLkldvi7GPMJY8,19614
 ipex_llm/transformers/low_bit_linear.py,sha256=3EtbiCAq5HU_r2pGJ9beSDK4NPTN8Jj-aHMqm1jqX18,39177
-ipex_llm/transformers/model.py,sha256=cQJNlAkdfoWmVbWd-TS2hf-Do41mMO9orPvG3FO4Nns,40855
+ipex_llm/transformers/model.py,sha256=FyHrEQhkHxG3FbGkhTjVOP2rgFMjc3AXcjDwvvB0HqU,40798
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=LMmRmhq8IAN9FrXLUeUK2B8XS2OJ5GVWmG0cEdeK-ro,40354
+ipex_llm/transformers/npu_model.py,sha256=zgXOiLIJ-3p-1Kejgv4jUFK8OiBZbezMZrRyn0_6_8c,40306
 ipex_llm/transformers/patches.py,sha256=G9KcXxo42H1HJEDaroq4JbBN5P0P0lty7U7kk7-g4tw,991
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=qV9Y6G5kAaet77LLA3oXn3qQY4ayyAPZ7NAjOlHCS7g,14967
@@ -202,17 +202,17 @@ ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ
 ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
 ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
 ipex_llm/transformers/npu_models/qwen2.py,sha256=RDNtPK8kxMk3z8A4S53saTrw2klgkzo4oa7voJLwr1o,12085
-ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=dnxpkLVW2bUsL4V-kZTyT2itc5aOpIB5vP3U7FtWdrs,44184
+ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=EKiI80rnQ43WUF_2wWCy75mx-rbjAbRQSB49OgjZFNo,45003
 ipex_llm/transformers/npu_models/stablelm.py,sha256=0iUhdjFqFd0svuTd09wP60mbEtobPkNSj-1I1vfuhsU,7778
 ipex_llm/transformers/npu_models/xlm_mp.py,sha256=sj8OVun8xJprM7ZJp0XzWa55rqlSIzNMdKmI9i6jlDU,28332
 ipex_llm/transformers/npu_pipeline_model/__init__.py,sha256=b2IXvVqQ5cItki021h8s3ymW12RPu8QNPprq4Mn3bDM,586
 ipex_llm/transformers/npu_pipeline_model/baichuan.py,sha256=ICxRzFQ4OIANDkkVi2_4xOeQXmfFXYMx3H52KuE1xR4,6208
-ipex_llm/transformers/npu_pipeline_model/common.py,sha256=QxJoJESpv0BpwO_FBeAT2wKA56wNFfen8iI37PrMKuA,7838
-ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=-eHNbRuX2QhYd0-jCyo2pZpHTZTZ108bhObYx8a3CJs,29494
-ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=pmAnawfAn0W8XSr8kGWfxR1HylCLa-Y6mKpFeX-m8UY,20892
+ipex_llm/transformers/npu_pipeline_model/common.py,sha256=faooJmM75qnVyZYuQLx9gJpVlotcVF4qXRCnOrknfk4,14776
+ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=_l4RFmyBMbREo8vzKpHXAMtE202JVQ41Y2lPg1qCOMI,29846
+ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=j2sipfFSrzV2VgLKPOClMHwWIDXqDsL1jIQJK25hneo,14397
 ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=H7j_UaHj-IwEBriQ-bunle0-8s2NmvqnL9eYuixnmFc,21398
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=FAfoPlKEAxeU6-J8ltpSev5ithm9AC-urtreu6NGpME,15509
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=6MNtCL1CXoR19B4tKZSgv2e5gtma9bqDG7DOYMCnPt0,16013
 ipex_llm/utils/__init__.py,sha256=LlUgrD03rfw4iY8zWPtHH6p65Gw76waVOLHaqagETw0,1425
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -248,11 +248,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.0b20250206.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
-ipex_llm-2.2.0b20250206.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
-ipex_llm-2.2.0b20250206.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
-ipex_llm-2.2.0b20250206.dist-info/METADATA,sha256=pAr_-dBEJB_J2lV8oNgJkJ5bGTObiseNHISkXAGkY9I,12879
-ipex_llm-2.2.0b20250206.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
-ipex_llm-2.2.0b20250206.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
-ipex_llm-2.2.0b20250206.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
-ipex_llm-2.2.0b20250206.dist-info/RECORD,,
+ipex_llm-2.2.0b20250207.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
+ipex_llm-2.2.0b20250207.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
+ipex_llm-2.2.0b20250207.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
+ipex_llm-2.2.0b20250207.dist-info/METADATA,sha256=d1hx5hE5Xeb3lHGWqeF35SK9GZOX6syXJ_Syu5b35IU,12369
+ipex_llm-2.2.0b20250207.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
+ipex_llm-2.2.0b20250207.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250207.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250207.dist-info/RECORD,,