ipex-llm 2.2.0b20250108__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250109__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/transformers/convert.py +5 -13
- ipex_llm/transformers/models/baichuan.py +7 -36
- ipex_llm/transformers/models/bert.py +2 -13
- ipex_llm/transformers/models/chatglm2.py +8 -31
- ipex_llm/transformers/models/chatglm4.py +9 -4
- ipex_llm/transformers/models/chatglm4v.py +1 -1
- ipex_llm/transformers/models/common.py +3 -1
- ipex_llm/transformers/models/glm.py +1 -1
- ipex_llm/transformers/models/internlm.py +6 -3
- ipex_llm/transformers/models/llama.py +1 -1
- ipex_llm/transformers/models/minicpm.py +1 -1
- ipex_llm/transformers/models/minicpm3.py +3 -1
- ipex_llm/transformers/models/mistral.py +1 -1
- ipex_llm/transformers/models/mllama.py +1 -1
- ipex_llm/transformers/models/phi3.py +6 -2
- ipex_llm/transformers/models/qwen.py +4 -2
- ipex_llm/transformers/models/qwen2.py +4 -3
- ipex_llm/transformers/models/qwen2_moe.py +4 -2
- ipex_llm/transformers/models/qwen2_vl.py +3 -1
- ipex_llm/transformers/models/stablelm.py +3 -1
- ipex_llm/transformers/models/starcoder2.py +3 -1
- ipex_llm/transformers/models/utils.py +7 -3
- ipex_llm/transformers/models/yuan.py +2 -1
- {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +31 -31
- {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init +0 -0
- {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat +0 -0
- {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli +0 -0
- {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/convert.py
CHANGED
@@ -1325,7 +1325,6 @@ def _optimize_post(model):
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
         from ipex_llm.transformers.models.chatglm2 import chatglm2_attention_forward
-        from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
         from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
         from ipex_llm.transformers.models.chatglm2 import chatglm2_model_forward
         from ipex_llm.transformers.models.chatglm2 import mlp_forward
@@ -1338,9 +1337,7 @@ def _optimize_post(model):
         convert_forward(model,
                         module.ChatGLMModel,
                         chatglm2_model_forward)
-        convert_forward(model,
-                        module.RMSNorm,
-                        chatglm_rms_norm_forward)
+        convert_forward(model, module.RMSNorm, rms_norm_forward)
         convert_forward(model, module.MLP, mlp_forward)
         # for codegeex-nano
         if hasattr(model.config, "rope_ratio"):
@@ -1358,8 +1355,7 @@ def _optimize_post(model):
         # glm4 family
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
-
-        convert_forward(model, module.RMSNorm, chatglm_rms_norm_forward)
+        convert_forward(model, module.RMSNorm, rms_norm_forward)

         if hasattr(model.transformer, "vision"):
             # glm4 vision family
@@ -1448,8 +1444,8 @@ def _optimize_post(model):
    elif model.config.model_type == "baichuan":
        modeling_module_name = model.__class__.__module__
        module = importlib.import_module(modeling_module_name)
-
-        convert_forward(model, module.MLP,
+        convert_forward(model, module.RMSNorm, rms_norm_forward)
+        convert_forward(model, module.MLP, mlp_silu_forward)

        if model.config.hidden_size in [4096, 2048]:
            # baichuan-7B and baichuan2-7B
@@ -1458,7 +1454,6 @@ def _optimize_post(model):
            for i in range(len(model.model.layers)):
                setattr(model.model.layers[i].self_attn, "layer_idx", i)
            convert_forward(model, module.Attention, baichuan_attention_forward_7b)
-            convert_forward(model, module.RMSNorm, rms_norm_forward)
            if model.config.vocab_size == 125696:
                # baichuan2-7B
                convert_forward(model, module.BaichuanModel, baichuan_model_7b_forward)
@@ -1468,9 +1463,7 @@ def _optimize_post(model):
        elif model.config.hidden_size == 5120:
            # baichuan-13B and baichuan2-13B
            from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_13b
-            from ipex_llm.transformers.models.baichuan import baichuan_13b_rms_norm_forward
            convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b)
-            convert_forward(model, module.RMSNorm, baichuan_13b_rms_norm_forward)

            if model.config.vocab_size == 125696:
                # baichaun2-13B
@@ -1565,7 +1558,6 @@ def _optimize_post(model):
        from ipex_llm.transformers.models.qwen import qwen_attention_forward
        from ipex_llm.transformers.models.qwen import qwen_attention_forward_registered
        from ipex_llm.transformers.models.qwen import qwen_mlp_forward
-        from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
        from ipex_llm.transformers.models.qwen import qwen_model_forward
        if model.config.max_position_embeddings == 8192 \
                and model.config.hidden_size == 4096:
@@ -1580,7 +1572,7 @@ def _optimize_post(model):
            )
            convert_forward(model,
                            module.RMSNorm,
-                            chatglm_rms_norm_forward)
+                            rms_norm_forward)
            convert_forward(model,
                            module.QWenMLP,
                            qwen_mlp_forward)
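Every change in this file is wired up through convert_forward(model, ModuleClass, new_forward); the hunks above only swap which forward each module class receives (the shared rms_norm_forward and mlp_silu_forward from models/common instead of the per-model variants). convert_forward itself is not shown in this diff, so the following is a hedged, self-contained sketch of the rebinding mechanism it relies on, using toy names:

import types
import torch

def convert_forward_sketch(model: torch.nn.Module, target_cls, new_forward):
    # walk the module tree and rebind forward on every instance of target_cls
    for module in model.modules():
        if isinstance(module, target_cls):
            module.forward = types.MethodType(new_forward, module)

def doubled_linear_forward(self, x):
    # toy replacement forward, used only to show that the rebinding takes effect
    return torch.nn.functional.linear(x, self.weight, self.bias) * 2

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
convert_forward_sketch(model, torch.nn.Linear, doubled_linear_forward)
print(model(torch.ones(1, 4)).shape)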
ipex_llm/transformers/models/baichuan.py
CHANGED
@@ -47,38 +47,6 @@ def pre_compute_inv_freq(module: torch.nn.Module):
     module.register_buffer("inv_freq", inv_freq, persistent=False)


-def baichuan_13b_rms_norm_forward(self, hidden_states):
-    if hidden_states.device.type == "xpu" and not (self.training or hidden_states.requires_grad):
-        import xe_addons
-        x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
-        output = xe_addons.rms_norm(self.weight, x_2d, self.epsilon)
-        return output.reshape(hidden_states.shape)
-
-    input_dtype = hidden_states.dtype
-    hidden_states = hidden_states.to(torch.float32)
-    variance = hidden_states.pow(2).mean(-1, keepdim=True)
-    hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon)
-    return self.weight * hidden_states.to(input_dtype)
-
-
-def baichuan_mlp_forward(
-    self,
-    x: torch.Tensor,
-) -> torch.Tensor:
-    x_2d = x.view(-1, x.shape[-1])
-    qtype = getattr(self.gate_proj, "qtype", None)
-    if mlp_fusion_check(x_2d, qtype, self.training):
-        import xe_linear
-        if not x_2d.is_contiguous():
-            x_2d = x_2d.contiguous()
-        return self.down_proj(xe_linear.mlp_forward_xpu(
-            x_2d, self.gate_proj.weight.data, self.up_proj.weight.data,
-            x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_len,
-            SILU, qtype
-        ))
-    return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
 def baichuan_model_7b_forward(
     self,
     input_ids: torch.LongTensor = None,
@@ -105,7 +73,9 @@ def baichuan_model_7b_forward(
     if use_cache:
         inputs = input_ids if input_ids is not None else inputs_embeds
         use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
-        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs)
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
+                                                self.config.num_attention_heads,
+                                                self.config.num_attention_heads)
         if use_compress_kv and not isinstance(past_key_values,
                                               DynamicCompressCache):
             if use_quantize_kv:
@@ -278,8 +248,6 @@ def baichuan_attention_forward_7b(
     key_states = key_states.to(hidden_states.dtype)

     # IPEX-LLM OPT: kv cache and quantize kv
-    use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
-
     # [CompressKV]
     if use_compresskv:
         enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value,
@@ -290,6 +258,8 @@ def baichuan_attention_forward_7b(
             query_states, attention_mask, 1,
             self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH)
     else:
+        use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+                                                self.num_heads, self.num_heads)
         key_states, value_states = update_past_key_value(
             past_key_value, key_states, value_states,
             kv_seq_len, use_quantize_kv, device
@@ -340,7 +310,8 @@ def baichuan_attention_forward_13b(
         kv_seq_len += past_key_value[0].shape[2]

     # IPEX-LLM OPT: kv cache and quantize kv
-    use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
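With baichuan_mlp_forward deleted above, convert.py now routes Baichuan MLP modules to the shared mlp_silu_forward (see the convert.py hunks). The deleted code fell back to down_proj(act_fn(gate_proj(x)) * up_proj(x)) when the fused XPU kernel was unavailable; a hedged, eager-mode restatement of that SiLU-gated MLP for reference (not the ipex-llm implementation):

import torch
import torch.nn.functional as F

def silu_mlp_reference(x, gate_proj, up_proj, down_proj):
    # same math as the removed fallback line, with the activation written out as SiLU
    return down_proj(F.silu(gate_proj(x)) * up_proj(x))

hidden = 8
gate = torch.nn.Linear(hidden, 4 * hidden, bias=False)
up = torch.nn.Linear(hidden, 4 * hidden, bias=False)
down = torch.nn.Linear(4 * hidden, hidden, bias=False)
print(silu_mlp_reference(torch.randn(2, hidden), gate, up, down).shape)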
ipex_llm/transformers/models/bert.py
CHANGED
@@ -36,24 +36,13 @@ import math
 import torch
 from typing import Optional, Tuple
 from transformers.models.bert.modeling_bert import BertSelfAttention, BertEncoder
+from ipex_llm.transformers.models.common import merge_linear
 from ipex_llm.utils.common import invalidInputError


 def merge_qkv(module: torch.nn.Module):
     if isinstance(module, BertSelfAttention):
-        q_w = module.query.weight.data
-        k_w = module.key.weight.data
-        v_w = module.value.weight.data
-        q_b = module.query.bias.data
-        k_b = module.key.bias.data
-        v_b = module.value.bias.data
-        new_w = torch.cat([q_w, k_w, v_w], dim=0)
-        new_b = torch.cat([q_b, k_b, v_b], dim=-1)
-        qkv = torch.nn.Linear(0, 0, bias=True)
-        qkv.weight = torch.nn.Parameter(new_w, requires_grad=False)
-        qkv.bias = torch.nn.Parameter(new_b, requires_grad=False)
-        qkv.in_features = module.query.in_features
-        qkv.out_features = module.query.out_features * 3
+        qkv = merge_linear([module.query, module.key, module.value])
         module.qkv = qkv
         del module.query
         del module.key
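The hand-rolled weight concatenation above is replaced by merge_linear from models/common.py. Its exact implementation is not part of this diff; a hedged sketch that is equivalent to the removed code, fusing several Linear layers into a single projection:

import torch

def merge_linear_sketch(linears):
    # stack output dimensions: weight rows and bias entries concatenate in order
    new_weight = torch.cat([lin.weight.data for lin in linears], dim=0)
    new_bias = torch.cat([lin.bias.data for lin in linears], dim=-1)
    merged = torch.nn.Linear(linears[0].in_features, new_weight.size(0), bias=True)
    merged.weight = torch.nn.Parameter(new_weight, requires_grad=False)
    merged.bias = torch.nn.Parameter(new_bias, requires_grad=False)
    return merged

q, k, v = (torch.nn.Linear(16, 16) for _ in range(3))
qkv = merge_linear_sketch([q, k, v])
x = torch.randn(2, 16)
assert torch.allclose(qkv(x), torch.cat([q(x), k(x), v(x)], dim=-1), atol=1e-6)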
ipex_llm/transformers/models/chatglm2.py
CHANGED
@@ -33,34 +33,6 @@ from ipex_llm.transformers.kv import DynamicCompressCache, DynamicCompressFp8Cac
 KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states
-    go from (batch, num_key_value_heads, seqlen, head_dim) to
-    (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads,
-                                                           n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def chatglm_rms_norm_forward(self, hidden_states):
-    if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
-        import xe_addons
-        x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
-        output = xe_addons.rms_norm(self.weight, x_2d, self.eps)
-        return output.reshape(hidden_states.shape)
-
-    input_dtype = hidden_states.dtype
-    hidden_states = hidden_states.to(torch.float32)
-    variance = hidden_states.pow(2).mean(-1, keepdim=True)
-    hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
-    return self.weight * hidden_states.to(input_dtype)
-
-
 def chatglm2_model_forward(
     self,
     input_ids,
@@ -91,8 +63,13 @@ def chatglm2_model_forward(

     if use_cache:
         use_compress_kv = should_use_compresskv(input_ids, input_ids.shape[1])
+        n_heads = self.config.num_attention_heads
+        if self.config.multi_query_attention:
+            n_kv_heads = self.config.multi_query_group_num
+        else:
+            n_kv_heads = n_heads
         use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
-                                                input_ids)
+                                                input_ids, n_heads, n_kv_heads)
         if use_compress_kv and not isinstance(past_key_values,
                                               DynamicCompressCache):
             if use_quantize_kv:
@@ -285,8 +262,6 @@ def chatglm2_attention_forward(
         key_states[..., :rot_dim] = k_rot[...]

     # IPEX-LLM OPT: kv cache and quantize kv
-    use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
     # [CompressKV]
     if use_compresskv:
         from transformers.configuration_utils import PretrainedConfig
@@ -300,6 +275,8 @@ def chatglm2_attention_forward(
             self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
         )
     else:
+        use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+                                                n_head, n_kv_head)
         key_states, value_states = update_past_key_value(
             past_key_value, key_states, value_states,
             kv_seq_len, use_quantize_kv, hidden_states.device
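The chatglm2 model forward (and chatglm4 below) now derives the KV head count from the config before calling use_quantize_kv_cache: multi_query_group_num when multi-query attention is enabled, otherwise the full head count. A hedged, self-contained restatement with a stand-in config object:

from dataclasses import dataclass

@dataclass
class GLMConfigSketch:
    # stand-in for the ChatGLM config; field names follow the hunk above
    num_attention_heads: int = 32
    multi_query_attention: bool = True
    multi_query_group_num: int = 2

def kv_heads(config: GLMConfigSketch) -> int:
    if config.multi_query_attention:
        return config.multi_query_group_num
    return config.num_attention_heads

print(kv_heads(GLMConfigSketch()))                              # 2
print(kv_heads(GLMConfigSketch(multi_query_attention=False)))   # 32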
ipex_llm/transformers/models/chatglm4.py
CHANGED
@@ -55,8 +55,13 @@ def chatglm4_model_forward(
     if use_cache:
         inputs = input_ids if input_ids is not None else inputs_embeds
         use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
-
-
+        n_heads = self.config.num_attention_heads
+        if self.config.multi_query_attention:
+            n_kv_heads = self.config.multi_query_group_num
+        else:
+            n_kv_heads = n_heads
+        use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj, inputs,
+                                                n_heads, n_kv_heads)
         if use_compress_kv and not isinstance(past_key_values,
                                               DynamicCompressCache):
             if use_quantize_kv:
@@ -211,8 +216,6 @@ def chatglm4_attention_forward(
     key_states[..., :rot_dim] = k_rot[...]

     # IPEX-LLM OPT: kv cache and quantize kv
-    use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
     # [CompressKV]
     if use_compresskv:
         from transformers.configuration_utils import PretrainedConfig
@@ -226,6 +229,8 @@ def chatglm4_attention_forward(
             self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
         )
     else:
+        use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+                                                n_head, n_kv_head)
         key_states, value_states = update_past_key_value(
             past_key_value, key_states, value_states,
             kv_seq_len, use_quantize_kv, hidden_states.device
ipex_llm/transformers/models/chatglm4v.py
CHANGED
@@ -230,7 +230,7 @@ def chatglm4v_attention_forward(
     key_states[..., :rot_dim] = k_rot[...]

     # IPEX-LLM OPT: kv cache and quantize kv
-    use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
+    use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states, n_head, n_kv_head)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, hidden_states.device
ipex_llm/transformers/models/common.py
CHANGED
@@ -157,8 +157,10 @@ def rms_norm_forward(self, hidden_states: torch.Tensor):
     weight = self.weight
     if hasattr(self, "variance_epsilon"):
         eps = self.variance_epsilon
-    else:
+    elif hasattr(self, "epsilon"):
         eps = self.epsilon
+    else:
+        eps = self.eps

     if hidden_states.device.type == 'xpu' and hidden_states.dtype in [torch.float, torch.half]:
         import xe_addons
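rms_norm_forward in models/common.py now also accepts modules whose epsilon attribute is called eps, which is what lets it replace chatglm_rms_norm_forward and baichuan_13b_rms_norm_forward (both deleted above). The XPU fast path goes through the xe_addons.rms_norm kernel and is not reproduced here; a hedged restatement of the slow path and of the attribute lookup added in this hunk:

import torch

def resolve_eps(module) -> float:
    # lookup order from the hunk above: variance_epsilon, then epsilon, then eps
    if hasattr(module, "variance_epsilon"):
        return module.variance_epsilon
    elif hasattr(module, "epsilon"):
        return module.epsilon
    else:
        return module.eps

def rms_norm_reference(weight: torch.Tensor, hidden_states: torch.Tensor, eps: float) -> torch.Tensor:
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + eps)
    return weight * hidden_states.to(input_dtype)

norm = torch.nn.LayerNorm(8)   # any module exposing an eps attribute works for resolve_eps
print(rms_norm_reference(torch.ones(8), torch.randn(2, 8), resolve_eps(norm)).shape)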
ipex_llm/transformers/models/glm.py
CHANGED
@@ -147,7 +147,7 @@ def glm_model_forward_wrapper(origin_forward):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         use_cache = use_cache or inputs.device.type == 'xpu'
         use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
-                                                self.config.num_attention_heads //
+                                                self.config.num_attention_heads,
                                                 self.config.num_key_value_heads)

         if use_cache:
ipex_llm/transformers/models/internlm.py
CHANGED
@@ -87,7 +87,8 @@ def internlm_attention_forward(
     )

     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, hidden_states.device
@@ -171,7 +172,8 @@ def internlm2_attention_forward(
     )

     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+                                            self.num_heads, self.num_key_value_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, hidden_states.device
@@ -346,7 +348,8 @@ def internlm_xcomposser2_attention_forward(
         query_states, key_states, cos, sin, position_ids, "internlm")

     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+                                            self.num_heads, self.num_key_value_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/llama.py
CHANGED
@@ -72,7 +72,7 @@ def llama_model_forward(
     use_cache = True if inputs.device.type == "xpu" else use_cache
     use_quantize_kv = use_quantize_kv_cache(
         self.layers[0].mlp.down_proj, inputs,
-        self.config.num_attention_heads
+        self.config.num_attention_heads, self.config.num_key_value_heads
     )
     use_compresskv = should_use_compresskv(inputs, inputs.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/minicpm.py
CHANGED
@@ -159,7 +159,7 @@ def minicpm_model_forward_wrapper(origin_forward):
         # IPEX-LLM OPT: kv cache and quantize kv cache
         inputs = input_ids if input_ids is not None else inputs_embeds
         use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
-                                                self.config.num_attention_heads //
+                                                self.config.num_attention_heads,
                                                 self.config.num_key_value_heads)
         use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
             isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/minicpm3.py
CHANGED
@@ -66,7 +66,9 @@ def minicpm3_model_forward_wrapper(origin_forward):
         inputs = input_ids if input_ids is not None else inputs_embeds
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         use_cache = True if inputs.device.type == "xpu" else use_cache
-
+        num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                                num_heads, num_kv_heads)
         if use_cache:
             if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
                 past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/mistral.py
CHANGED
@@ -71,7 +71,7 @@ def mistral_model_forward(
     use_cache = use_cache if use_cache is not None else self.config.use_cache
     use_cache = use_cache or inputs.device.type == 'xpu'
     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
-                                            self.config.num_attention_heads //
+                                            self.config.num_attention_heads,
                                             self.config.num_key_value_heads)
     use_compress_kv = should_use_compresskv(inputs, inputs.size(1)) or \
         isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/mllama.py
CHANGED
@@ -113,7 +113,7 @@ def mllama_text_model_forward(
     use_cache = True if inputs.device.type == "xpu" else use_cache
     use_quantize_kv = use_quantize_kv_cache(
         self.layers[0].mlp.down_proj, inputs,
-        self.config.num_attention_heads
+        self.config.num_attention_heads, self.config.num_key_value_heads
     )
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
ipex_llm/transformers/models/phi3.py
CHANGED
@@ -249,7 +249,9 @@ def phi3_model_forward_wrapper(origin_model_forward):
         # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         inputs = input_ids if input_ids is not None else inputs_embeds
-
+        num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                                num_heads, num_kv_heads)
         use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
             isinstance(past_key_values, DynamicCompressCache)
         if use_cache:
@@ -305,7 +307,9 @@ def phi3v_model_forward_wrapper(origin_model_forward):
     ):
         # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
+        num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids,
+                                                num_heads, num_kv_heads)
         if use_cache:
             if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
                 past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen.py
CHANGED
@@ -107,7 +107,8 @@ def qwen_attention_forward(
     query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
@@ -205,7 +206,8 @@ def qwen_attention_forward_registered(
     query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/qwen2.py
CHANGED
@@ -113,10 +113,10 @@ def qwen2_model_forward(
     # ipex-llm changes start
     # IPEX-LLM OPT: kv cache and quantize kv cache
     inputs = input_ids if input_ids is not None else inputs_embeds
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
     use_quantize_kv = (
         self.config.hidden_size != 3584 # disable quantize kv in specific model
-        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
-                                  self.config.num_attention_heads//self.config.num_key_value_heads)
+        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
     )
     use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)
@@ -305,10 +305,11 @@ def qwen2_model_forward_4_42(

     # ipex-llm changes start
     # IPEX-LLM OPT: kv cache and quantize kv cache
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
     use_quantize_kv = (
         self.config.hidden_size != 3584 # disable quantize kv in specific model
         and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
-                                  self.config.num_attention_heads//self.config.num_key_value_heads)
+                                  num_heads, num_kv_heads)
     )
     use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/qwen2_moe.py
CHANGED
@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
     return_dict: Optional[bool] = None,
 ):
     use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-
+    inputs = input_ids if input_ids is not None else inputs_embeds
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+    use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+                                            num_heads, num_kv_heads)
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen2_vl.py
CHANGED
@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
     # IPEX-LLM OPT start: kv cache and quantize kv cache
     inputs = input_ids if input_ids is not None else inputs_embeds
     use_cache = True if inputs.device.type == "xpu" else use_cache
-
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+    use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                            num_heads, num_kv_heads)
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/stablelm.py
CHANGED
@@ -69,8 +69,10 @@ def stablelm_model_forward(
 ):
     # IPEX-LLM OPT: kv cache and quantize kv cache
     use_cache = use_cache if use_cache is not None else self.config.use_cache
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
     use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
-                       and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+                       and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+                                                 num_heads, num_kv_heads))
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/starcoder2.py
CHANGED
@@ -132,7 +132,9 @@ def model_forward(
     return_dict: Optional[bool] = None,
 ):
     use_cache = use_cache if use_cache is not None else self.config.use_cache
-
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+    use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+                                            num_heads, num_kv_heads)
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/utils.py
CHANGED
@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
     return new_cache_k, new_cache_v


-def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+                          num_heads: int, num_kv_heads: int) -> bool:
     if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
         warnings.warn(
             "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
     else:
         device_name = get_xpu_device_name(x.device)
         return (
-
-
+            num_kv_heads >= 4
+            and (
+                device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+                or device_name in ["arc", "bmg"] and x.size(0) > 1
+            )
         )

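This is the hunk all the model-file changes above feed into: use_quantize_kv_cache now receives the query and KV head counts and gates the FP8 KV cache on them per device family. A hedged, self-contained restatement of the new condition (the environment-variable handling and the checks earlier in the function are omitted):

def should_quantize_kv(device_name: str, batch_size: int, num_heads: int, num_kv_heads: int) -> bool:
    return (
        num_kv_heads >= 4
        and (
            device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
            or device_name in ["arc", "bmg"] and batch_size > 1
        )
    )

print(should_quantize_kv("arc", batch_size=2, num_heads=32, num_kv_heads=8))   # True
print(should_quantize_kv("mtl", batch_size=1, num_heads=32, num_kv_heads=8))   # True, ratio 4 <= 4
print(should_quantize_kv("arc", batch_size=1, num_heads=32, num_kv_heads=8))   # False, single batch on arc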
ipex_llm/transformers/models/yuan.py
CHANGED
@@ -158,7 +158,8 @@ def yuan_attention_forward(
         "yuan")

     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         None if past_key_value is None else (past_key_value[0], past_key_value[1]),
         key_states, value_states,
{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250108
+Version: 2.2.0b20250109
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.
+Requires-Dist: bigdl-core-npu ==2.6.0b20250109 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250109 ; extra == 'xpu-2-6'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
 Provides-Extra: xpu-arc
 Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
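All of the pinned bigdl-core packages move to the matching 20250109 nightly build along with the wheel itself. A hedged way to confirm which build is active after upgrading (assumes the wheel is already installed in the current environment):

import importlib.metadata

# expected to print 2.2.0b20250109 once this wheel is installed
print(importlib.metadata.version("ipex-llm"))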
{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD
CHANGED
@@ -94,7 +94,7 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
 ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
 ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
 ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
-ipex_llm/transformers/convert.py,sha256=
+ipex_llm/transformers/convert.py,sha256=umI137wqV2d4itS0AJQoZcygeWBATpSJSDJ805cZ-SY,98499
 ipex_llm/transformers/convert_ipex.py,sha256=iKXo0n8fVFTOA2fNYYrByMFK0dovL-kLd2sVDk88AlQ,14334
 ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
 ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
@@ -144,45 +144,45 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
 ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
 ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
 ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
-ipex_llm/transformers/models/baichuan.py,sha256=
-ipex_llm/transformers/models/bert.py,sha256=
+ipex_llm/transformers/models/baichuan.py,sha256=cAQLmVG-3R8CSTGTcDy2JOOzVe-Ej8AXjIEIjvZBGlo,18376
+ipex_llm/transformers/models/bert.py,sha256=0Mm9jkvkzBxtc_z_GE1TcZoPz-HOg2Z2973ZEWgSwJk,5601
 ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
 ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
-ipex_llm/transformers/models/chatglm2.py,sha256=
-ipex_llm/transformers/models/chatglm4.py,sha256=
-ipex_llm/transformers/models/chatglm4v.py,sha256=
-ipex_llm/transformers/models/common.py,sha256=
+ipex_llm/transformers/models/chatglm2.py,sha256=KyAIX7zGVQDQuwwM3QMBNWZbTeMHEzKUIgAryT0voHc,14933
+ipex_llm/transformers/models/chatglm4.py,sha256=QvUehdaCePB3MNHyWg3dneDxmjtBdxYeKUyQUVcsgfM,16886
+ipex_llm/transformers/models/chatglm4v.py,sha256=L6y45M_wjS2_HqchmCUxRlQZUNuSNCGOiynAQrGh918,14124
+ipex_llm/transformers/models/common.py,sha256=Q3IEfGqvxoHyfIIF5s8qHmOJBBP3b2jyVAXk8C3b1Pg,11636
 ipex_llm/transformers/models/decilm.py,sha256=P-PBuDPf07GvKggLwJx_wPwIn6esN3rX8ai2JxRuZmE,5246
 ipex_llm/transformers/models/gemma.py,sha256=_E3Yw8Y45xyNVeLqyVKcpr8kjuICtETeL82cJ-bWJuU,9424
 ipex_llm/transformers/models/gemma2.py,sha256=2WZuv-FLzJyTJFaYxOuzJt47QE64M0lHnzAiO5T6ozI,8049
-ipex_llm/transformers/models/glm.py,sha256=
+ipex_llm/transformers/models/glm.py,sha256=lmeEWd_W2O638VzVW4Gm6cJre5XZcg_QBmPs8NWqXsM,7202
 ipex_llm/transformers/models/gpt2.py,sha256=YSaNgK1uLCFDuIFqnKO0Mi-AsOZsYav-7pNf_NpKGdM,3445
 ipex_llm/transformers/models/gptbigcode.py,sha256=cP1_qGWoa43R2WacAMblShjku4QupcCZiLaPPAoOUs4,9101
 ipex_llm/transformers/models/gptneox.py,sha256=loRh1x_5S6BCeOr_s5xr-N_1SQHL3Y5IiUBAEyoMUqQ,6172
-ipex_llm/transformers/models/internlm.py,sha256=
+ipex_llm/transformers/models/internlm.py,sha256=OifyiobRligleyZLpLBSe44A6Sq0uMG-8-NOcRCcT4Q,18080
 ipex_llm/transformers/models/internvl.py,sha256=Vx0vENIEQLX2M6P398mw5TOhpks0U8xf8rtRQvy94go,8154
-ipex_llm/transformers/models/llama.py,sha256=
-ipex_llm/transformers/models/minicpm.py,sha256=
-ipex_llm/transformers/models/minicpm3.py,sha256=
+ipex_llm/transformers/models/llama.py,sha256=NzpyQve_RC9ez1W-jWPLGZ80k_S1I5Rx5saAzCsDIoI,8558
+ipex_llm/transformers/models/minicpm.py,sha256=eaPNVNrep0_xGoELhZd886ff0ceoKqB6cusdAhd52eE,10145
+ipex_llm/transformers/models/minicpm3.py,sha256=11cYl8KM2hoIJNMAOZMxiwCu6dMhup9ric_OEn8-VrQ,9363
 ipex_llm/transformers/models/minicpmv.py,sha256=PP05b5iTnrMpiseCn8iJcxKJDnfq7WqXp9Mrch0kKZ0,9876
-ipex_llm/transformers/models/mistral.py,sha256=
-ipex_llm/transformers/models/mllama.py,sha256=
+ipex_llm/transformers/models/mistral.py,sha256=uVhkdXaq15v1P3QY0emVsA7SxUbAWChHEEXYN-drjpQ,7449
+ipex_llm/transformers/models/mllama.py,sha256=ZyRq9DTKsvk1AlRbr-z6ngjS3Sr_7YuGZ6-Yr1MBBAM,10937
 ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifsD907TQ,9540
 ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
-ipex_llm/transformers/models/phi3.py,sha256=
+ipex_llm/transformers/models/phi3.py,sha256=Fo6PlZ24Gdm7eeeZOTMm1Bfh3U6P4rvq7-_2FHvp0vE,15503
 ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
-ipex_llm/transformers/models/qwen.py,sha256=
-ipex_llm/transformers/models/qwen2.py,sha256=
-ipex_llm/transformers/models/qwen2_moe.py,sha256=
-ipex_llm/transformers/models/qwen2_vl.py,sha256=
+ipex_llm/transformers/models/qwen.py,sha256=A3WiVCzA7NLkcjp4zhFkZvKZzZWZlg0WFuVV_556TAI,19543
+ipex_llm/transformers/models/qwen2.py,sha256=JLaY9ZT7A22oO0G8K-nvjvKQDaIrKA5o-jEHvk_y3eI,25604
+ipex_llm/transformers/models/qwen2_moe.py,sha256=a0gYo-ngf8SxaEnBdZUJDnPS6Mkn_poDd8xqhx50icI,19516
+ipex_llm/transformers/models/qwen2_vl.py,sha256=NrhxlaPj7W-HUBmKc3CSTwZy1lkoZ9qDaxM4GvE0kHs,13583
 ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
 ipex_llm/transformers/models/rwkv4.py,sha256=H4KMtxN0JA2ZTXnonHpsUUJ5xULemo-D1Jzl0ri_UY8,6123
 ipex_llm/transformers/models/rwkv5.py,sha256=OkRNj1pCAZg1z2Fw-I0DEnxLEdZyPeRSQ6msrkxLOCs,10710
 ipex_llm/transformers/models/sd.py,sha256=VvHV5u-0k2MgHu3NL9113hPj7DgfxqctuKzEEeNfRDU,5981
-ipex_llm/transformers/models/stablelm.py,sha256=
-ipex_llm/transformers/models/starcoder2.py,sha256=
-ipex_llm/transformers/models/utils.py,sha256=
-ipex_llm/transformers/models/yuan.py,sha256=
+ipex_llm/transformers/models/stablelm.py,sha256=fj-XtOnR6kggnFUQTMPCOOzolkPztN06WAv8QW-XRnI,7054
+ipex_llm/transformers/models/starcoder2.py,sha256=ONKvD7JCkRM0DI-R56x28QFBJ7CjD5hOZBQ_3WfOcNk,6626
+ipex_llm/transformers/models/utils.py,sha256=ihbWS5kQK2KHDVPkMhgjik3nM8B2fWf-E-z4BWNUstk,15568
+ipex_llm/transformers/models/yuan.py,sha256=JYAn_ZaSGK0NBJLEIxCACfAq084a66GFJkdd5NbpmMA,7732
 ipex_llm/transformers/npu_models/__init__.py,sha256=ulEUGLjaP48LCrVeury3UxLjXxKzRi0UpSG4bYu-7f8,585
 ipex_llm/transformers/npu_models/baichuan.py,sha256=fJtd7fBrttySghRUgfZTAdxLjsSNC-XL08HISsXigLE,4685
 ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLrV_4z89DIb4JjE3207M,45026
@@ -250,11 +250,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
+ipex_llm-2.2.0b20250109.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ipex_llm-2.2.0b20250109.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ipex_llm-2.2.0b20250109.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ipex_llm-2.2.0b20250109.dist-info/METADATA,sha256=gPslIWSw_X5E5ULhQa8rOHeRo_UeBDXCAyPjBSPB-nU,12705
+ipex_llm-2.2.0b20250109.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ipex_llm-2.2.0b20250109.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250109.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250109.dist-info/RECORD,,
{ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init
File without changes
{ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat
File without changes
{ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli
File without changes
{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL
File without changes
{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt
File without changes
{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt
File without changes