ipex-llm 2.2.0b20250206__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/model.py +0 -1
- ipex_llm/transformers/npu_model.py +0 -1
- ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
- ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +7 -2
- ipex_llm/transformers/npu_pipeline_model/llama.py +6 -158
- ipex_llm/transformers/npu_pipeline_model/qwen.py +44 -32
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +44 -44
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll: CHANGED (binary file)
ipex_llm/libs/bloom.dll: CHANGED (binary file)
ipex_llm/libs/gptneox-api.dll: CHANGED (binary file)
ipex_llm/libs/gptneox.dll: CHANGED (binary file)
ipex_llm/libs/libbloom_avx.dll: CHANGED (binary file)
ipex_llm/libs/libbloom_vnni.dll: CHANGED (binary file)
ipex_llm/libs/libgptneox_avx.dll: CHANGED (binary file)
ipex_llm/libs/libgptneox_vnni.dll: CHANGED (binary file)
ipex_llm/libs/libllama_avx.dll: CHANGED (binary file)
ipex_llm/libs/libllama_vnni.dll: CHANGED (binary file)
ipex_llm/libs/libstarcoder_avx.dll: CHANGED (binary file)
ipex_llm/libs/libstarcoder_vnni.dll: CHANGED (binary file)
ipex_llm/libs/llama-api.dll: CHANGED (binary file)
ipex_llm/libs/llama.dll: CHANGED (binary file)
ipex_llm/libs/main-bloom.exe: CHANGED (binary file)
ipex_llm/libs/main-gptneox.exe: CHANGED (binary file)
ipex_llm/libs/main-llama.exe: CHANGED (binary file)
ipex_llm/libs/main-starcoder.exe: CHANGED (binary file)
ipex_llm/libs/pipeline.dll: CHANGED (binary file)
ipex_llm/libs/quantize-bloom.exe: CHANGED (binary file)
ipex_llm/libs/quantize-bloom_vnni.exe: CHANGED (binary file)
ipex_llm/libs/quantize-gptneox.exe: CHANGED (binary file)
ipex_llm/libs/quantize-gptneox_vnni.exe: CHANGED (binary file)
ipex_llm/libs/quantize-llama.exe: CHANGED (binary file)
ipex_llm/libs/quantize-llama_vnni.exe: CHANGED (binary file)
ipex_llm/libs/quantize-starcoder.exe: CHANGED (binary file)
ipex_llm/libs/quantize-starcoder_vnni.exe: CHANGED (binary file)
ipex_llm/libs/starcoder-api.dll: CHANGED (binary file)
ipex_llm/libs/starcoder.dll: CHANGED (binary file)

ipex_llm/transformers/model.py
CHANGED
@@ -233,7 +233,6 @@ class _BaseAutoModelClass:
             optimize_model = False
             kwargs["modules_to_not_convert"] = ["lm_head"]
 
-        load_in_8bit = kwargs.pop("load_in_8bit", False)
         from ipex_llm.llm_patching import bigdl_patched
         if bigdl_patched == 'Train':
             global patched_training_mode

ipex_llm/transformers/npu_model.py
CHANGED
@@ -117,7 +117,6 @@ class _BaseAutoModelClass:
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
         ignore_argument(kwargs, "load_in_4bit")
-        ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")

ipex_llm/transformers/npu_models/qwen2_mp.py
CHANGED
@@ -98,6 +98,8 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         n_splits_linear: int = 1,
         n_splits_down_proj: int = 1,
         group_size: int = 0,
+        cos_len: int = 1,
+        keep_position_ids=True,
         asym: bool = False,
     ):
         super().__init__(max_seq_len=max_seq_len,
@@ -114,18 +116,13 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         self.dtype = dtype
         self.cached_cos = cached_cos
         self.cached_sin = cached_sin
+        self.cos_len = cos_len
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers
 
-        cos = self.constant(self.cached_cos)
-        self.cos = self.unsqueeze(cos, axis=0)
-
-        sin = self.constant(self.cached_sin)
-        self.sin = self.unsqueeze(sin, axis=0)
-
         if mode == "decode":
             self.kv_seq_len = self.max_seq_len + 1
         else:
@@ -148,7 +145,21 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         attention_mask = self.create_input_op(
             (self.batch_size, 1, self.seq_len, self.seq_len), dtype=np.float16)
 
-        position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+        if self.cached_cos is None:
+            if mode == "prefill" and keep_position_ids:
+                position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.cos = self.convert_to_fp16(cos)
+            sin = self.create_input_op((self.batch_size, self.cos_len, self.head_dim),
+                                       dtype=np.float32)
+            self.sin = self.convert_to_fp16(sin)
+        else:
+            position_ids = self.create_input_op((self.batch_size, self.seq_len), dtype=np.int64)
+            cos = self.constant(self.cached_cos)
+            self.cos = self.unsqueeze(cos, axis=0)
+            sin = self.constant(self.cached_sin)
+            self.sin = self.unsqueeze(sin, axis=0)
 
         if input_layernorm_weights is None:
             input_layernorm_weights = []
@@ -211,11 +222,12 @@ class LowBitQwenMultiDecoderlayer(LLMBaseNNFactory):
         hidden_states = input
 
         curr_key_values = []
+        cos_condition = cached_cos is not None or (mode == "prefill" and keep_position_ids)
         for i in range(num_layers):
             hidden_states, new_key_states, new_value_states = self.build_decoder(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                position_ids=position_ids,
+                position_ids=position_ids if cos_condition else None,
                 input_layernorm_weight=input_layernorm_weights[i],
                 post_attention_layernorm_weight=post_attn_layernorm_weights[i],
                 q_bias=q_biases[i],

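The net effect of the qwen2_mp.py hunks above: when the Hugging Face rotary embedding no longer exposes cos_cached/sin_cached (transformers >= 4.45), the decoder graph stops baking cos/sin in as constants and instead takes them as runtime inputs of shape (batch, cos_len, head_dim), keeping position_ids as a graph input only for prefill. A minimal NumPy sketch of the cos/sin tensors a host-side caller would then compute and feed each step (a hypothetical helper, not the NNFactory code from the package):

import numpy as np

def rotary_cos_sin(inv_freq, position_ids, attention_scaling=1.0):
    # inv_freq: (head_dim // 2,) array taken from the HF rotary embedding
    # position_ids: (batch, seq_len) integer positions
    # returns cos, sin of shape (batch, seq_len, head_dim)
    freqs = position_ids[..., None].astype(np.float32) * inv_freq[None, None, :]
    emb = np.concatenate([freqs, freqs], axis=-1)
    return np.cos(emb) * attention_scaling, np.sin(emb) * attention_scaling

# decode step for one token at position 41 with head_dim = 128
inv_freq = 1.0 / (10000.0 ** (np.arange(0, 64, dtype=np.float32) / 64.0))
cos, sin = rotary_cos_sin(inv_freq, np.array([[41]]))
print(cos.shape, sin.shape)  # (1, 1, 128) (1, 1, 128), i.e. (batch, cos_len, head_dim)
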
ipex_llm/transformers/npu_pipeline_model/common.py
CHANGED
@@ -173,6 +173,105 @@ class LLMEmbedding(NNFactory):
         self.compile()
 
 
+class Llama32Embedding(NNFactory):
+    def __init__(
+        self,
+        vocab_size,
+        embedding_dim,
+        embedding_weight,
+        padding_idx,
+        inv_freq,
+        attention_scaling,
+        dtype,  # fp16
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.attention_scaling = attention_scaling
+        self.dtype = dtype
+
+        # define input
+        weight = self.constant(embedding_weight)
+        input = self.parameter((1, 1), dtype=np.int32)
+        position_ids = self.parameter((1, 1), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # embed_tokens module
+        if padding_idx == -1:
+            padding_idx += vocab_size
+
+        axis_node = self.constant(np.array([0], dtype=np.int64))
+        if padding_idx is not None:
+            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
+            masked_embeddings[padding_idx, :] = 0.0  # mask
+
+            node_mask = self.constant(masked_embeddings)
+            node_masked_w = self.eltwise_mul(weight, node_mask)
+            res = self.gather(node_masked_w, input, axis_node, 0)
+        else:
+            res = self.gather(weight, input, axis_node, 0)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, 1))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        # define outputs
+        res = self.convert_to_fp16(res)
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
+class Llama32PostEmbedding(NNFactory):
+    def __init__(
+        self,
+        inv_freq,
+        attention_scaling,
+        input_len: int = 1,
+        device: str = "NPU",
+    ):
+        super().__init__(False, device)
+        self.attention_scaling = attention_scaling
+
+        # define input
+        position_ids = self.parameter((1, input_len), dtype=np.int64)
+        inv_freq = self.constant(inv_freq)
+
+        # rotary_emb module
+        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
+        position_ids = self.reshape(position_ids, (1, 1, input_len))
+        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
+                                 self.convert_to_fp32(position_ids))
+        freqs = self.transpose(freqs, [0, 2, 1])
+        emb = self.concat(freqs, freqs, axis=2)
+        cos = self.cos(emb)
+        sin = self.sin(emb)
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+        if input_len > 1:
+            cos = self.unsqueeze(cos, [1])
+            sin = self.unsqueeze(sin, [1])
+
+        # define outputs
+        cos = self.convert_to_fp32(cos)
+        sin = self.convert_to_fp32(sin)
+
+        print("start compiling")
+        self.compile()
+
+
 def obtain_weight_from_single_layer(attn_layer, mlp_layer):
     weights = []
     if hasattr(attn_layer, "q_proj_dq_list"):
@@ -216,3 +315,65 @@ def obtain_qkv_bias_from_single_layer(attn_layer):
     k_bias = attn_layer.k_proj.bias.to(torch.float16)
     v_bias = attn_layer.v_proj.bias.to(torch.float16)
     return q_bias, k_bias, v_bias
+
+
+def obtain_embedding_from_model(model, convert_model, temp_dir, weight_dir,
+                                max_prompt_len, keep_ir, compile_blob):
+    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+        # llama-2-7B & llama-3-8B
+        embedding_layer = model.model.embed_tokens
+        new_embedding = LLMEmbedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    else:
+        # llama-3.2-3B & llama-3.2-1B
+        # for transformers >= 4.45.0
+        embedding_layer = model.model.embed_tokens
+        new_embedding = Llama32Embedding(
+            vocab_size=model.config.vocab_size,
+            embedding_dim=model.config.hidden_size,
+            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
+            padding_idx=model.config.pad_token_id,
+            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
+            attention_scaling=model.model.rotary_emb.attention_scaling,
+            dtype=np.float16,
+        )
+        if convert_model:
+            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
+            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+            first_blob_path = None
+            # save embedding post module
+            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
+            attention_scaling = model.model.rotary_emb.attention_scaling
+            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                  attention_scaling=attention_scaling,
+                                                  input_len=1)
+            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
+                                                          attention_scaling=attention_scaling,
+                                                          input_len=max_prompt_len)
+            update_names_of_IR_and_export_blob(embedding_post_prefill,
+                                               "embedding_post_prefill",
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
+        else:
+            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    return first_blob_path

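One detail of the new Llama32Embedding graph worth calling out is the padding handling: instead of a padding-aware lookup, it zeroes the padding row of the weight table once and then does a plain gather. A small NumPy sketch (toy sizes, plain arrays rather than NNFactory ops) showing that this matches an embedding lookup that returns zeros for padding_idx:

import numpy as np

vocab_size, dim, padding_idx = 6, 4, 2
weight = np.random.rand(vocab_size, dim).astype(np.float16)

# what the graph builds: zero the padding row up front, then gather by token id
masked = weight.copy()
masked[padding_idx, :] = 0.0
token_ids = np.array([[2, 5, 0]])
graph_like = masked[token_ids]

# reference: ordinary lookup, then zero the positions that hit padding_idx
reference = weight[token_ids]
reference[token_ids == padding_idx] = 0.0

assert np.array_equal(graph_like, reference)
print(graph_like.shape)  # (1, 3, 4)
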
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
CHANGED
@@ -31,6 +31,7 @@ import tempfile
 import numpy as np
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from multiprocessing import Pool
+import transformers
 
 
 def generate(
@@ -456,6 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         custom_object_save(model, save_directory, config=model.config)
 
     if model.config.model_type == "qwen2":
+        cos_sin_input = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
+        embedding_post = not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached")
         if group_size == 0:
             if model.config.hidden_size == 1536:
                 # Qwen2-1.5B-Instruct
@@ -476,6 +479,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "use_prefill_sdp": False,
                        "weight_num": 7,
                        "weight_idx": 8,
+                       "embedding_post": embedding_post,
+                       "cos_sin_input": cos_sin_input,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
@@ -493,8 +498,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                           group_size, layernorm_const, "prefill",
                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
-        convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size,
+        convert_lm_head_and_embedding(model, save_directory, weight_dir, convert_model=True,
+                                      group_size=group_size, max_prompt_len=max_prompt_len,
                                       keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False

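Both new qwen2 flags come from the same probe: whether the first decoder layer's rotary_emb still carries a cos_cached buffer (transformers < 4.45) or not (>= 4.45, where cos/sin must be fed at runtime and the embedding_post blobs are exported). A sketch of that probe on stand-in objects (SimpleNamespace is used here only to make the snippet runnable; the real argument is the loaded model):

from types import SimpleNamespace

def uses_runtime_cos_sin(model) -> bool:
    # Mirrors the hasattr check above: True when the rotary embedding no longer
    # caches cos/sin, so the pipeline must feed them as runtime inputs.
    rotary = model.model.layers[0].self_attn.rotary_emb
    return not hasattr(rotary, "cos_cached")

old_style = SimpleNamespace(model=SimpleNamespace(layers=[SimpleNamespace(
    self_attn=SimpleNamespace(rotary_emb=SimpleNamespace(cos_cached="...")))]))
new_style = SimpleNamespace(model=SimpleNamespace(layers=[SimpleNamespace(
    self_attn=SimpleNamespace(rotary_emb=SimpleNamespace(inv_freq="...")))]))

print(uses_runtime_cos_sin(old_style))   # False -> cos/sin stay baked into the blobs
print(uses_runtime_cos_sin(new_style))   # True  -> cos_sin_input / embedding_post enabled
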
ipex_llm/transformers/npu_pipeline_model/llama.py
CHANGED
@@ -18,108 +18,8 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
@@ -197,62 +97,10 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
-        # llama-2-7B & llama-3-8B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = LLMEmbedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
-            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, keep_ir=keep_ir,
-                                                                 compile_blob=compile_blob)
-            os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
 
     return first_blob_path, last_blob_path
 

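With the classes moved into common.py, llama.py only has to pass max_prompt_len through so that the shared helper can build the prefill variant of the post-embedding graph. The shape consequence of that input_len switch, per the Llama32PostEmbedding code above, is sketched below with NumPy stand-ins (not the NNFactory graph): decode keeps (1, input_len, head_dim), while prefill picks up an extra axis from the unsqueeze.

import numpy as np

def post_embedding_cos_shape(inv_freq_len, input_len):
    position_ids = np.arange(input_len, dtype=np.float32).reshape(1, 1, input_len)
    inv_freq = np.ones((1, inv_freq_len, 1), dtype=np.float32)
    freqs = np.transpose(inv_freq * position_ids, (0, 2, 1))  # (1, input_len, inv_freq_len)
    emb = np.concatenate([freqs, freqs], axis=2)              # (1, input_len, head_dim)
    cos = np.cos(emb)
    if input_len > 1:                                         # mirrors unsqueeze(cos, [1])
        cos = np.expand_dims(cos, 1)
    return cos.shape

print(post_embedding_cos_shape(64, 1))    # (1, 1, 128)       decode graph
print(post_embedding_cos_shape(64, 512))  # (1, 1, 512, 128)  prefill graph (max_prompt_len)
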
ipex_llm/transformers/npu_pipeline_model/qwen.py
CHANGED
@@ -18,13 +18,14 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0,
+                                  convert_model=False, group_size=0, max_prompt_len=1,
                                   keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -107,24 +108,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    embedding_layer = model.model.embed_tokens
-    new_embedding = LLMEmbedding(
-        vocab_size=model.config.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, keep_ir=keep_ir,
-                                                             compile_blob=compile_blob)
-        os.remove(os.path.join(temp_dir, "embedding.bin"))
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
@@ -145,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -158,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(
@@ -185,6 +179,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
@@ -196,14 +192,25 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if layernorm_const:
-            st_idx = 3
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if layernorm_const:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-            st_idx = 5
+            # transformers >= 4.45.0
+            if layernorm_const:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -261,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
     attn_layer = curr_layer.self_attn
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 

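The decode-mode index bookkeeping above is easy to misread in diff form; the added branch only shifts the starting slot of the q/k/v bias bins by one when cos/sin become runtime inputs. A hypothetical helper (not part of the package) that reproduces the st_idx values from the hunk:

def layer_input_start_index(has_cached_cos: bool, layernorm_const: bool) -> int:
    # inputs 0-2 are input_embed / attention_mask / position_id; without cached
    # cos/sin (transformers >= 4.45) the runtime cos/sin input shifts things by one,
    # and non-constant layernorm weights take two more slots before the q/k/v biases
    st_idx = 3 if has_cached_cos else 4
    if not layernorm_const:
        st_idx += 2
    return st_idx

for has_cached in (True, False):
    for const in (True, False):
        print(has_cached, const, layer_input_start_index(has_cached, const))
# (True, True) -> 3, (True, False) -> 5, (False, True) -> 4, (False, False) -> 6,
# matching the st_idx assignments in the hunk above
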
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250206
+Version: 2.2.0b20250207
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,19 +27,12 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250207 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
-Requires-Dist: onednn-devel ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: onednn ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
-Requires-Dist: mkl-dpcpp ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; (platform_system == "Windows") and extra == 'cpp'
+Requires-Dist: mkl-dpcpp ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
 Provides-Extra: llama-index
 Requires-Dist: py-cpuinfo ; extra == 'llama-index'
 Requires-Dist: protobuf ; extra == 'llama-index'
@@ -67,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250206 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250207 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250207 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.0b20250206 ; extra == 'xpu-2-6'
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250207 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -140,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -163,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -186,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250207 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'

{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD
CHANGED
@@ -41,35 +41,35 @@ ipex_llm/langchain/llms/transformerspipelinellm.py,sha256=vm522YPPwWxxAPVvQBtxRf
 ipex_llm/langchain/vllm/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/langchain/vllm/vllm.py,sha256=6dxc-ZISZQrJilEa_HA827l75Dv9rcHpY_G6FdJ8BVs,7793
 ipex_llm/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ipex_llm/libs/bloom-api.dll,sha256=
-ipex_llm/libs/bloom.dll,sha256=
-ipex_llm/libs/gptneox-api.dll,sha256=
-ipex_llm/libs/gptneox.dll,sha256=
-ipex_llm/libs/libbloom_avx.dll,sha256=
-ipex_llm/libs/libbloom_vnni.dll,sha256=
-ipex_llm/libs/libgptneox_avx.dll,sha256=
-ipex_llm/libs/libgptneox_vnni.dll,sha256=
-ipex_llm/libs/libllama_avx.dll,sha256=
-ipex_llm/libs/libllama_vnni.dll,sha256=
-ipex_llm/libs/libstarcoder_avx.dll,sha256=
-ipex_llm/libs/libstarcoder_vnni.dll,sha256=
-ipex_llm/libs/llama-api.dll,sha256=
-ipex_llm/libs/llama.dll,sha256=
-ipex_llm/libs/main-bloom.exe,sha256=
-ipex_llm/libs/main-gptneox.exe,sha256=
-ipex_llm/libs/main-llama.exe,sha256=
-ipex_llm/libs/main-starcoder.exe,sha256=
-ipex_llm/libs/pipeline.dll,sha256=
-ipex_llm/libs/quantize-bloom.exe,sha256=
-ipex_llm/libs/quantize-bloom_vnni.exe,sha256=
-ipex_llm/libs/quantize-gptneox.exe,sha256=
-ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=
-ipex_llm/libs/quantize-llama.exe,sha256=
-ipex_llm/libs/quantize-llama_vnni.exe,sha256=
-ipex_llm/libs/quantize-starcoder.exe,sha256=
-ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=
-ipex_llm/libs/starcoder-api.dll,sha256=
-ipex_llm/libs/starcoder.dll,sha256=
+ipex_llm/libs/bloom-api.dll,sha256=R0zcv1M0D8y8inrrCUO2xCSTRb0IChVyLa6YQo9zne8,36352
+ipex_llm/libs/bloom.dll,sha256=eBzUhLMeOAb9InMPp9_KC5VhJC9F-YKNlJn6HyfOAb0,507904
+ipex_llm/libs/gptneox-api.dll,sha256=9_mq8IntnMiU7-_kDxiLojnEc1nu3rrxZZAIes7Nd4k,24576
+ipex_llm/libs/gptneox.dll,sha256=kR3dyhN7tNUxVIWoqudW57V0MIGqr-Mxkmw7kwR8VWs,568320
+ipex_llm/libs/libbloom_avx.dll,sha256=0iRHd_QIzEG_NI0RkFKmCX_HG-3E21t33sxrmbCpQwo,536576
+ipex_llm/libs/libbloom_vnni.dll,sha256=dL1TzKzoki8KDsCmka6QfzBH24T06WokxT3F4M5a3lk,508416
+ipex_llm/libs/libgptneox_avx.dll,sha256=SPi9xXxB5jLp63CfgVhmMA-rCoyCCji2nuWz-rv5y3E,596992
+ipex_llm/libs/libgptneox_vnni.dll,sha256=NV3xykgHJGxNTDWAA_yhwlBG_dbHPX0__5s9uHCPmfc,568832
+ipex_llm/libs/libllama_avx.dll,sha256=EbZ-lpHHtM-zS9aiuDU8cBVueVAtRi3UqerARH41qC8,591360
+ipex_llm/libs/libllama_vnni.dll,sha256=67XqNSyXI1nuaA1-xcSOhYIHZaH7aZBvwMetGpTriIk,563200
+ipex_llm/libs/libstarcoder_avx.dll,sha256=kAqXHfoZfmyqIbNbGpzQjXNCMz9pkG5KVRECzEDEwhM,627712
+ipex_llm/libs/libstarcoder_vnni.dll,sha256=c02B9jpBvST282jRXJtkRwJKkZnzhkz5MLdFfjH9T8I,599552
+ipex_llm/libs/llama-api.dll,sha256=SA2frHXocsnAN9z3LZfWT_FjY1waSMS26bHM6ot_07c,25600
+ipex_llm/libs/llama.dll,sha256=Ls7CKimo2SNy-uJt6lLz16yz1O9E358dRgP8E0svF98,562688
+ipex_llm/libs/main-bloom.exe,sha256=-HCik31DRGrozp_Uy420O1l-Sk_7e9V1bjg4XaLPFvA,103424
+ipex_llm/libs/main-gptneox.exe,sha256=pqxQCGKBrsoDtvuKhCwk6uOAGt4GGvzoAdQbHB9qrFI,98816
+ipex_llm/libs/main-llama.exe,sha256=sPKj3WRmI97jyNhO4A5Lz4eF-tsZZojv6z2VaNzAKAU,99840
+ipex_llm/libs/main-starcoder.exe,sha256=7vyW8v2qO1J_fkRq4uzk44UsV4AhDGmcWHUwMiez8WY,157696
+ipex_llm/libs/pipeline.dll,sha256=vHFtLO6vUZQVwtzXICv1Q5Ork32Dw5Ipqa8pbr6TtmM,72704
+ipex_llm/libs/quantize-bloom.exe,sha256=8rUxXU7Z4AZ7mFHI3sGpwGG18_DkapunwTzzUTjCCbo,126464
+ipex_llm/libs/quantize-bloom_vnni.exe,sha256=gA9kKUkmFOIzT_CmFFvG-fG6d6bZuEWSTeyPvhCsDLs,128000
+ipex_llm/libs/quantize-gptneox.exe,sha256=YsrviyLjQU9uxD1p6TfdBAPXG72-QzZFGpt7lDmK_gM,104448
+ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=mYmUHza3rZztjTogXv9FxuIM20z0gHfyjbF6b6ADEK0,104960
+ipex_llm/libs/quantize-llama.exe,sha256=h-7nbo0uIswViTdxf_vHmE3sZdnQ79dDMUHzqjtyMKs,110080
+ipex_llm/libs/quantize-llama_vnni.exe,sha256=OEPzGySIaa-O9IhPY-u2slHnhMDzp6mL8e_Qr2WUgKc,110592
+ipex_llm/libs/quantize-starcoder.exe,sha256=4U-jT0MC4Iz4kP_6WpKkMOSk_hTlqAwgSVlGLGa-imA,127488
+ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=Xc4jW9KH_RNSfJIYJinDRIx-BbWmqxx4h-kc9jowZpk,128512
+ipex_llm/libs/starcoder-api.dll,sha256=2lF73SE1AyICwtpQSZUfkiAbE1WJQ5gEbikL1Lsvzhg,21504
+ipex_llm/libs/starcoder.dll,sha256=NBh51OQS90ppaqMAJAFCa6HptcUnnPx7tUL1J95QwMk,599040
 ipex_llm/llamaindex/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/llamaindex/llms/__init__.py,sha256=KP1lEdGqDuxPoxL1ZSH25Pm2kKMPJBWUTLR0ckSLMIU,1139
 ipex_llm/llamaindex/llms/bigdlllm.py,sha256=FQBzq1KOjfc6uofTXAha3O7TqpJkNfOFepXQmOVlbnI,26314
@@ -95,9 +95,9 @@ ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s
 ipex_llm/transformers/loader.py,sha256=AwjV5RpI2t2bedlv7ZhLm8cfd-QJZm5hny-XyjIvdnk,6876
 ipex_llm/transformers/lookup.py,sha256=b6OlZ9OV10R9qeWw8mVryVpDxszkjwLkldvi7GPMJY8,19614
 ipex_llm/transformers/low_bit_linear.py,sha256=3EtbiCAq5HU_r2pGJ9beSDK4NPTN8Jj-aHMqm1jqX18,39177
-ipex_llm/transformers/model.py,sha256=
+ipex_llm/transformers/model.py,sha256=FyHrEQhkHxG3FbGkhTjVOP2rgFMjc3AXcjDwvvB0HqU,40798
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=
+ipex_llm/transformers/npu_model.py,sha256=zgXOiLIJ-3p-1Kejgv4jUFK8OiBZbezMZrRyn0_6_8c,40306
 ipex_llm/transformers/patches.py,sha256=G9KcXxo42H1HJEDaroq4JbBN5P0P0lty7U7kk7-g4tw,991
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=qV9Y6G5kAaet77LLA3oXn3qQY4ayyAPZ7NAjOlHCS7g,14967
@@ -202,17 +202,17 @@ ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ
 ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
 ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
 ipex_llm/transformers/npu_models/qwen2.py,sha256=RDNtPK8kxMk3z8A4S53saTrw2klgkzo4oa7voJLwr1o,12085
-ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=
+ipex_llm/transformers/npu_models/qwen2_mp.py,sha256=EKiI80rnQ43WUF_2wWCy75mx-rbjAbRQSB49OgjZFNo,45003
 ipex_llm/transformers/npu_models/stablelm.py,sha256=0iUhdjFqFd0svuTd09wP60mbEtobPkNSj-1I1vfuhsU,7778
 ipex_llm/transformers/npu_models/xlm_mp.py,sha256=sj8OVun8xJprM7ZJp0XzWa55rqlSIzNMdKmI9i6jlDU,28332
 ipex_llm/transformers/npu_pipeline_model/__init__.py,sha256=b2IXvVqQ5cItki021h8s3ymW12RPu8QNPprq4Mn3bDM,586
 ipex_llm/transformers/npu_pipeline_model/baichuan.py,sha256=ICxRzFQ4OIANDkkVi2_4xOeQXmfFXYMx3H52KuE1xR4,6208
-ipex_llm/transformers/npu_pipeline_model/common.py,sha256=
-ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=
-ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/common.py,sha256=faooJmM75qnVyZYuQLx9gJpVlotcVF4qXRCnOrknfk4,14776
+ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=_l4RFmyBMbREo8vzKpHXAMtE202JVQ41Y2lPg1qCOMI,29846
+ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=j2sipfFSrzV2VgLKPOClMHwWIDXqDsL1jIQJK25hneo,14397
 ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=H7j_UaHj-IwEBriQ-bunle0-8s2NmvqnL9eYuixnmFc,21398
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=6MNtCL1CXoR19B4tKZSgv2e5gtma9bqDG7DOYMCnPt0,16013
 ipex_llm/utils/__init__.py,sha256=LlUgrD03rfw4iY8zWPtHH6p65Gw76waVOLHaqagETw0,1425
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -248,11 +248,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.0b20250206.data/scripts/ipex-llm-init.bat,sha256=
-ipex_llm-2.2.0b20250206.data/scripts/llm-chat.ps1,sha256=
-ipex_llm-2.2.0b20250206.data/scripts/llm-cli.ps1,sha256=
-ipex_llm-2.2.0b20250206.dist-info/METADATA,sha256=
-ipex_llm-2.2.0b20250206.dist-info/WHEEL,sha256=
-ipex_llm-2.2.0b20250206.dist-info/entry_points.txt,sha256=
-ipex_llm-2.2.0b20250206.dist-info/top_level.txt,sha256=
-ipex_llm-2.2.0b20250206.dist-info/RECORD,,
+ipex_llm-2.2.0b20250207.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
+ipex_llm-2.2.0b20250207.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
+ipex_llm-2.2.0b20250207.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
+ipex_llm-2.2.0b20250207.dist-info/METADATA,sha256=d1hx5hE5Xeb3lHGWqeF35SK9GZOX6syXJ_Syu5b35IU,12369
+ipex_llm-2.2.0b20250207.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
+ipex_llm-2.2.0b20250207.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250207.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250207.dist-info/RECORD,,

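The sha256= values in RECORD use the standard wheel encoding (SHA-256 digest, URL-safe base64 with the trailing padding stripped), so the new entries can be re-checked against an unpacked wheel. A short sketch, assuming that standard encoding:

import base64
import hashlib

def record_hash(path):
    # sha256 digest, urlsafe base64 without '=' padding, as in the RECORD entries above
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g., after unpacking the 20250207 wheel (path illustrative):
# record_hash("ipex_llm/libs/bloom-api.dll")
# -> "sha256=R0zcv1M0D8y8inrrCUO2xCSTRb0IChVyLa6YQo9zne8"
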
{ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat: File without changes
{ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1: File without changes
{ipex_llm-2.2.0b20250206.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1: File without changes
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL: File without changes
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt: File without changes
{ipex_llm-2.2.0b20250206.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt: File without changes