ipex-llm 2.2.0b20250108__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250109__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ipex_llm/transformers/convert.py +5 -13
  2. ipex_llm/transformers/models/baichuan.py +7 -36
  3. ipex_llm/transformers/models/bert.py +2 -13
  4. ipex_llm/transformers/models/chatglm2.py +8 -31
  5. ipex_llm/transformers/models/chatglm4.py +9 -4
  6. ipex_llm/transformers/models/chatglm4v.py +1 -1
  7. ipex_llm/transformers/models/common.py +3 -1
  8. ipex_llm/transformers/models/glm.py +1 -1
  9. ipex_llm/transformers/models/internlm.py +6 -3
  10. ipex_llm/transformers/models/llama.py +1 -1
  11. ipex_llm/transformers/models/minicpm.py +1 -1
  12. ipex_llm/transformers/models/minicpm3.py +3 -1
  13. ipex_llm/transformers/models/mistral.py +1 -1
  14. ipex_llm/transformers/models/mllama.py +1 -1
  15. ipex_llm/transformers/models/phi3.py +6 -2
  16. ipex_llm/transformers/models/qwen.py +4 -2
  17. ipex_llm/transformers/models/qwen2.py +4 -3
  18. ipex_llm/transformers/models/qwen2_moe.py +4 -2
  19. ipex_llm/transformers/models/qwen2_vl.py +3 -1
  20. ipex_llm/transformers/models/stablelm.py +3 -1
  21. ipex_llm/transformers/models/starcoder2.py +3 -1
  22. ipex_llm/transformers/models/utils.py +7 -3
  23. ipex_llm/transformers/models/yuan.py +2 -1
  24. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
  25. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +31 -31
  26. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init +0 -0
  27. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat +0 -0
  28. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli +0 -0
  29. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
  30. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
  31. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/convert.py
@@ -1325,7 +1325,6 @@ def _optimize_post(model):
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
  from ipex_llm.transformers.models.chatglm2 import chatglm2_attention_forward
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
  from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
  from ipex_llm.transformers.models.chatglm2 import chatglm2_model_forward
  from ipex_llm.transformers.models.chatglm2 import mlp_forward
@@ -1338,9 +1337,7 @@ def _optimize_post(model):
  convert_forward(model,
  module.ChatGLMModel,
  chatglm2_model_forward)
- convert_forward(model,
- module.RMSNorm,
- chatglm_rms_norm_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)
  convert_forward(model, module.MLP, mlp_forward)
  # for codegeex-nano
  if hasattr(model.config, "rope_ratio"):
@@ -1358,8 +1355,7 @@ def _optimize_post(model):
  # glm4 family
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
- convert_forward(model, module.RMSNorm, chatglm_rms_norm_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)

  if hasattr(model.transformer, "vision"):
  # glm4 vision family
@@ -1448,8 +1444,8 @@ def _optimize_post(model):
  elif model.config.model_type == "baichuan":
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.baichuan import baichuan_mlp_forward
- convert_forward(model, module.MLP, baichuan_mlp_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)
+ convert_forward(model, module.MLP, mlp_silu_forward)

  if model.config.hidden_size in [4096, 2048]:
  # baichuan-7B and baichuan2-7B
@@ -1458,7 +1454,6 @@ def _optimize_post(model):
  for i in range(len(model.model.layers)):
  setattr(model.model.layers[i].self_attn, "layer_idx", i)
  convert_forward(model, module.Attention, baichuan_attention_forward_7b)
- convert_forward(model, module.RMSNorm, rms_norm_forward)
  if model.config.vocab_size == 125696:
  # baichuan2-7B
  convert_forward(model, module.BaichuanModel, baichuan_model_7b_forward)
@@ -1468,9 +1463,7 @@ def _optimize_post(model):
  elif model.config.hidden_size == 5120:
  # baichuan-13B and baichuan2-13B
  from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_13b
- from ipex_llm.transformers.models.baichuan import baichuan_13b_rms_norm_forward
  convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b)
- convert_forward(model, module.RMSNorm, baichuan_13b_rms_norm_forward)

  if model.config.vocab_size == 125696:
  # baichaun2-13B
@@ -1565,7 +1558,6 @@ def _optimize_post(model):
  from ipex_llm.transformers.models.qwen import qwen_attention_forward
  from ipex_llm.transformers.models.qwen import qwen_attention_forward_registered
  from ipex_llm.transformers.models.qwen import qwen_mlp_forward
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
  from ipex_llm.transformers.models.qwen import qwen_model_forward
  if model.config.max_position_embeddings == 8192 \
  and model.config.hidden_size == 4096:
@@ -1580,7 +1572,7 @@ def _optimize_post(model):
  )
  convert_forward(model,
  module.RMSNorm,
- chatglm_rms_norm_forward)
+ rms_norm_forward)
  convert_forward(model,
  module.QWenMLP,
  qwen_mlp_forward)
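
Note on the convert.py hunks above: the per-model RMSNorm forwards (chatglm_rms_norm_forward, baichuan_13b_rms_norm_forward) are dropped in favor of the shared rms_norm_forward from models/common.py, and baichuan's MLP now uses the generic mlp_silu_forward. A minimal sketch of what a convert_forward-style helper does, assuming it simply rebinds forward on every matching submodule (the actual ipex_llm.transformers.convert implementation may differ in details):

    import types
    import torch

    def convert_forward_sketch(model: torch.nn.Module, target_cls, new_forward):
        # Walk the module tree and rebind `forward` on every instance of target_cls.
        for module in model.modules():
            if isinstance(module, target_cls):
                module.forward = types.MethodType(new_forward, module)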

ipex_llm/transformers/models/baichuan.py
@@ -47,38 +47,6 @@ def pre_compute_inv_freq(module: torch.nn.Module):
  module.register_buffer("inv_freq", inv_freq, persistent=False)


- def baichuan_13b_rms_norm_forward(self, hidden_states):
- if hidden_states.device.type == "xpu" and not (self.training or hidden_states.requires_grad):
- import xe_addons
- x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
- output = xe_addons.rms_norm(self.weight, x_2d, self.epsilon)
- return output.reshape(hidden_states.shape)
-
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon)
- return self.weight * hidden_states.to(input_dtype)
-
-
- def baichuan_mlp_forward(
- self,
- x: torch.Tensor,
- ) -> torch.Tensor:
- x_2d = x.view(-1, x.shape[-1])
- qtype = getattr(self.gate_proj, "qtype", None)
- if mlp_fusion_check(x_2d, qtype, self.training):
- import xe_linear
- if not x_2d.is_contiguous():
- x_2d = x_2d.contiguous()
- return self.down_proj(xe_linear.mlp_forward_xpu(
- x_2d, self.gate_proj.weight.data, self.up_proj.weight.data,
- x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_len,
- SILU, qtype
- ))
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
  def baichuan_model_7b_forward(
  self,
  input_ids: torch.LongTensor = None,
@@ -105,7 +73,9 @@ def baichuan_model_7b_forward(
  if use_cache:
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs)
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
+ self.config.num_attention_heads,
+ self.config.num_attention_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
@@ -278,8 +248,6 @@ def baichuan_attention_forward_7b(
  key_states = key_states.to(hidden_states.dtype)

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
-
  # [CompressKV]
  if use_compresskv:
  enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value,
@@ -290,6 +258,8 @@ def baichuan_attention_forward_7b(
  query_states, attention_mask, 1,
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH)
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
@@ -340,7 +310,8 @@ def baichuan_attention_forward_13b(
  kv_seq_len += past_key_value[0].shape[2]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
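
The removed baichuan_mlp_forward above is the standard SiLU-gated MLP with an optional fused xe_linear path for XPU; convert.py now registers the shared mlp_silu_forward instead. A sketch of the un-fused math that any replacement has to reproduce (assuming the usual gate/up/down projections; the fused path is omitted):

    import torch
    import torch.nn.functional as F

    def silu_mlp_reference(self, x: torch.Tensor) -> torch.Tensor:
        # down(silu(gate(x)) * up(x)), matching the fallback branch of the removed code
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))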

ipex_llm/transformers/models/bert.py
@@ -36,24 +36,13 @@ import math
  import torch
  from typing import Optional, Tuple
  from transformers.models.bert.modeling_bert import BertSelfAttention, BertEncoder
+ from ipex_llm.transformers.models.common import merge_linear
  from ipex_llm.utils.common import invalidInputError


  def merge_qkv(module: torch.nn.Module):
  if isinstance(module, BertSelfAttention):
- q_w = module.query.weight.data
- k_w = module.key.weight.data
- v_w = module.value.weight.data
- q_b = module.query.bias.data
- k_b = module.key.bias.data
- v_b = module.value.bias.data
- new_w = torch.cat([q_w, k_w, v_w], dim=0)
- new_b = torch.cat([q_b, k_b, v_b], dim=-1)
- qkv = torch.nn.Linear(0, 0, bias=True)
- qkv.weight = torch.nn.Parameter(new_w, requires_grad=False)
- qkv.bias = torch.nn.Parameter(new_b, requires_grad=False)
- qkv.in_features = module.query.in_features
- qkv.out_features = module.query.out_features * 3
+ qkv = merge_linear([module.query, module.key, module.value])
  module.qkv = qkv
  del module.query
  del module.key
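
merge_qkv now delegates to merge_linear from models/common.py. Its implementation is not shown in this diff, but judging from the removed hand-written version it presumably concatenates the query/key/value projections into one Linear; a sketch under that assumption (bias-carrying layers only):

    import torch

    def merge_linear_sketch(linears):
        # Stack N projections of shape (out, in) into a single (N*out, in) Linear.
        weight = torch.cat([l.weight.data for l in linears], dim=0)
        bias = torch.cat([l.bias.data for l in linears], dim=-1)
        merged = torch.nn.Linear(linears[0].in_features, weight.size(0), bias=True)
        merged.weight = torch.nn.Parameter(weight, requires_grad=False)
        merged.bias = torch.nn.Parameter(bias, requires_grad=False)
        return merged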

ipex_llm/transformers/models/chatglm2.py
@@ -33,34 +33,6 @@ from ipex_llm.transformers.kv import DynamicCompressCache, DynamicCompressFp8Cac
  KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
- """
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states
- go from (batch, num_key_value_heads, seqlen, head_dim) to
- (batch, num_attention_heads, seqlen, head_dim)
- """
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
- if n_rep == 1:
- return hidden_states
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads,
- n_rep, slen, head_dim)
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
- def chatglm_rms_norm_forward(self, hidden_states):
- if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
- import xe_addons
- x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
- output = xe_addons.rms_norm(self.weight, x_2d, self.eps)
- return output.reshape(hidden_states.shape)
-
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
- return self.weight * hidden_states.to(input_dtype)
-
-
  def chatglm2_model_forward(
  self,
  input_ids,
@@ -91,8 +63,13 @@ def chatglm2_model_forward(

  if use_cache:
  use_compress_kv = should_use_compresskv(input_ids, input_ids.shape[1])
+ n_heads = self.config.num_attention_heads
+ if self.config.multi_query_attention:
+ n_kv_heads = self.config.multi_query_group_num
+ else:
+ n_kv_heads = n_heads
  use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
- input_ids)
+ input_ids, n_heads, n_kv_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
@@ -285,8 +262,6 @@ def chatglm2_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
  # [CompressKV]
  if use_compresskv:
  from transformers.configuration_utils import PretrainedConfig
@@ -300,6 +275,8 @@ def chatglm2_attention_forward(
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
  )
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+ n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
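
For ChatGLM-family models the KV-head count is not stored as num_key_value_heads, so the model forward derives it from multi_query_attention / multi_query_group_num before calling the new use_quantize_kv_cache signature (the chatglm4 hunks below do the same). Illustration with hypothetical config values; a 2-group MQA config like this falls below the num_kv_heads >= 4 gate introduced in utils.py further down:

    # hypothetical ChatGLM-style config values, for illustration only
    num_attention_heads = 32
    multi_query_attention = True
    multi_query_group_num = 2

    n_heads = num_attention_heads
    n_kv_heads = multi_query_group_num if multi_query_attention else n_heads
    print(n_heads, n_kv_heads)  # 32 2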

ipex_llm/transformers/models/chatglm4.py
@@ -55,8 +55,13 @@ def chatglm4_model_forward(
  if use_cache:
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
- use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
- inputs)
+ n_heads = self.config.num_attention_heads
+ if self.config.multi_query_attention:
+ n_kv_heads = self.config.multi_query_group_num
+ else:
+ n_kv_heads = n_heads
+ use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj, inputs,
+ n_heads, n_kv_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
@@ -211,8 +216,6 @@ def chatglm4_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
  # [CompressKV]
  if use_compresskv:
  from transformers.configuration_utils import PretrainedConfig
@@ -226,6 +229,8 @@ def chatglm4_attention_forward(
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
  )
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+ n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

ipex_llm/transformers/models/chatglm4v.py
@@ -230,7 +230,7 @@ def chatglm4v_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states, n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

ipex_llm/transformers/models/common.py
@@ -157,8 +157,10 @@ def rms_norm_forward(self, hidden_states: torch.Tensor):
  weight = self.weight
  if hasattr(self, "variance_epsilon"):
  eps = self.variance_epsilon
- else:
+ elif hasattr(self, "epsilon"):
  eps = self.epsilon
+ else:
+ eps = self.eps

  if hidden_states.device.type == 'xpu' and hidden_states.dtype in [torch.float, torch.half]:
  import xe_addons
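
This common.py change lets the shared rms_norm_forward find the epsilon attribute under whichever name a model uses (variance_epsilon, epsilon, or eps), which is what allows it to replace the chatglm_rms_norm_forward and baichuan_13b_rms_norm_forward removed above. A CPU-only reference of the same dispatch plus the RMSNorm math (the xe_addons XPU fast path from the diff is omitted):

    import torch

    def rms_norm_reference(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Resolve eps under whichever attribute name the wrapped RMSNorm defines.
        if hasattr(self, "variance_epsilon"):
            eps = self.variance_epsilon
        elif hasattr(self, "epsilon"):
            eps = self.epsilon
        else:
            eps = self.eps
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + eps)
        return self.weight * hidden_states.to(input_dtype)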

ipex_llm/transformers/models/glm.py
@@ -147,7 +147,7 @@ def glm_model_forward_wrapper(origin_forward):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)

  if use_cache:

ipex_llm/transformers/models/internlm.py
@@ -87,7 +87,8 @@ def internlm_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -171,7 +172,8 @@ def internlm2_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -346,7 +348,8 @@ def internlm_xcomposser2_attention_forward(
  query_states, key_states, cos, sin, position_ids, "internlm")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device

ipex_llm/transformers/models/llama.py
@@ -72,7 +72,7 @@ def llama_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  use_compresskv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/minicpm.py
@@ -159,7 +159,7 @@ def minicpm_model_forward_wrapper(origin_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/minicpm3.py
@@ -66,7 +66,9 @@ def minicpm3_model_forward_wrapper(origin_forward):
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/mistral.py
@@ -71,7 +71,7 @@ def mistral_model_forward(
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.size(1)) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/mllama.py
@@ -113,7 +113,7 @@ def mllama_text_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):

ipex_llm/transformers/models/phi3.py
@@ -249,7 +249,9 @@ def phi3_model_forward_wrapper(origin_model_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  inputs = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
  if use_cache:
@@ -305,7 +307,9 @@ def phi3v_model_forward_wrapper(origin_model_forward):
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/qwen.py
@@ -107,7 +107,8 @@ def qwen_attention_forward(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
@@ -205,7 +206,8 @@ def qwen_attention_forward_registered(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device

ipex_llm/transformers/models/qwen2.py
@@ -113,10 +113,10 @@ def qwen2_model_forward(
  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
@@ -305,10 +305,11 @@ def qwen2_model_forward_4_42(

  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
  and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/qwen2_moe.py
@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- input = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input)
+ inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/qwen2_vl.py
@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
  # IPEX-LLM OPT start: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/stablelm.py
@@ -69,8 +69,10 @@ def stablelm_model_forward(
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  use_cache = use_cache if use_cache is not None else self.config.use_cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+ num_heads, num_kv_heads))
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/starcoder2.py
@@ -132,7 +132,9 @@ def model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/utils.py
@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
  return new_cache_k, new_cache_v


- def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: int = 1) -> bool:
+ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+ num_heads: int, num_kv_heads: int) -> bool:
  if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
  warnings.warn(
  "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
  else:
  device_name = get_xpu_device_name(x.device)
  return (
- device_name in ["mtl", "lnl", "arl"] and kv_group == 1
- or device_name in ["arc", "bmg"] and x.size(0) > 1
+ num_kv_heads >= 4
+ and (
+ device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+ or device_name in ["arc", "bmg"] and x.size(0) > 1
+ )
  )


ipex_llm/transformers/models/yuan.py
@@ -158,7 +158,8 @@ def yuan_attention_forward(
  "yuan")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  None if past_key_value is None else (past_key_value[0], past_key_value[1]),
  key_states, value_states,
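
All of the model-file hunks above feed the same utils.py change: use_quantize_kv_cache now takes the explicit (num_heads, num_kv_heads) pair instead of a precomputed kv_group ratio, and quantized KV cache is additionally gated on num_kv_heads >= 4. A standalone illustration of the new gate (device names and head counts are example values; the env-var overrides shown in the diff are omitted):

    def quantize_kv_gate(device_name: str, num_heads: int,
                         num_kv_heads: int, batch_size: int) -> bool:
        # Mirrors the return expression of the new use_quantize_kv_cache.
        return (
            num_kv_heads >= 4
            and (
                (device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4)
                or (device_name in ["arc", "bmg"] and batch_size > 1)
            )
        )

    print(quantize_kv_gate("mtl", 32, 8, 1))   # True: 8 KV heads, ratio 4
    print(quantize_kv_gate("mtl", 32, 2, 1))   # False: fewer than 4 KV heads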

{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250108
+ Version: 2.2.0b20250109
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250108 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
  Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250108 ; extra == 'cpp-arl'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp-arl'
  Requires-Dist: setuptools ; extra == 'cpp-arl'
  Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
  Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250108 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250109 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250108 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250108 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250108 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250108 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250109 ; extra == 'xpu-2-6'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
  Provides-Extra: xpu-arc
  Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'

{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD
@@ -94,7 +94,7 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
  ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
  ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
  ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
- ipex_llm/transformers/convert.py,sha256=APf2uHMgEeiAhsKm9dPgPWlyO0ADq2yHtZgovv9oczU,99101
+ ipex_llm/transformers/convert.py,sha256=umI137wqV2d4itS0AJQoZcygeWBATpSJSDJ805cZ-SY,98499
  ipex_llm/transformers/convert_ipex.py,sha256=iKXo0n8fVFTOA2fNYYrByMFK0dovL-kLd2sVDk88AlQ,14334
  ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
  ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
@@ -144,45 +144,45 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
  ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
  ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
  ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
- ipex_llm/transformers/models/baichuan.py,sha256=oJCAEENSG8oQhJ-QPN2SiapARjAGdOM6nEbyCcYOMCo,19334
- ipex_llm/transformers/models/bert.py,sha256=bJNic2pt1kph0kBwdK5MRGyWupFfx2Ts0V3D1L-5kWo,6085
+ ipex_llm/transformers/models/baichuan.py,sha256=cAQLmVG-3R8CSTGTcDy2JOOzVe-Ej8AXjIEIjvZBGlo,18376
+ ipex_llm/transformers/models/bert.py,sha256=0Mm9jkvkzBxtc_z_GE1TcZoPz-HOg2Z2973ZEWgSwJk,5601
  ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
  ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
- ipex_llm/transformers/models/chatglm2.py,sha256=SGCABJdYQLW0zDarEoWrEQLuWlbq9iQhYU8ZeR1-ptQ,15957
- ipex_llm/transformers/models/chatglm4.py,sha256=AAhAFFDDas5DBQPfh2Mwl7a2v7taKf6xphoeeNNFaBI,16593
- ipex_llm/transformers/models/chatglm4v.py,sha256=tyjDDyF6FEgLAT24EG3i4-auxZvkwmeLIy0Hds4K5Yo,14105
- ipex_llm/transformers/models/common.py,sha256=4obQMGF02FCiXrHnFle9Fsx7C33b1FDt37qJJ4YgxRc,11578
+ ipex_llm/transformers/models/chatglm2.py,sha256=KyAIX7zGVQDQuwwM3QMBNWZbTeMHEzKUIgAryT0voHc,14933
+ ipex_llm/transformers/models/chatglm4.py,sha256=QvUehdaCePB3MNHyWg3dneDxmjtBdxYeKUyQUVcsgfM,16886
+ ipex_llm/transformers/models/chatglm4v.py,sha256=L6y45M_wjS2_HqchmCUxRlQZUNuSNCGOiynAQrGh918,14124
+ ipex_llm/transformers/models/common.py,sha256=Q3IEfGqvxoHyfIIF5s8qHmOJBBP3b2jyVAXk8C3b1Pg,11636
  ipex_llm/transformers/models/decilm.py,sha256=P-PBuDPf07GvKggLwJx_wPwIn6esN3rX8ai2JxRuZmE,5246
  ipex_llm/transformers/models/gemma.py,sha256=_E3Yw8Y45xyNVeLqyVKcpr8kjuICtETeL82cJ-bWJuU,9424
  ipex_llm/transformers/models/gemma2.py,sha256=2WZuv-FLzJyTJFaYxOuzJt47QE64M0lHnzAiO5T6ozI,8049
- ipex_llm/transformers/models/glm.py,sha256=PE43uks9lojndBBHFVXK1VWisHhbY-kuCmhq0CwmD4s,7204
+ ipex_llm/transformers/models/glm.py,sha256=lmeEWd_W2O638VzVW4Gm6cJre5XZcg_QBmPs8NWqXsM,7202
  ipex_llm/transformers/models/gpt2.py,sha256=YSaNgK1uLCFDuIFqnKO0Mi-AsOZsYav-7pNf_NpKGdM,3445
  ipex_llm/transformers/models/gptbigcode.py,sha256=cP1_qGWoa43R2WacAMblShjku4QupcCZiLaPPAoOUs4,9101
  ipex_llm/transformers/models/gptneox.py,sha256=loRh1x_5S6BCeOr_s5xr-N_1SQHL3Y5IiUBAEyoMUqQ,6172
- ipex_llm/transformers/models/internlm.py,sha256=ZbIUMDwNRcrCeduXfbA_uq1AUEWawEt6CJRvQl3LkAg,17832
+ ipex_llm/transformers/models/internlm.py,sha256=OifyiobRligleyZLpLBSe44A6Sq0uMG-8-NOcRCcT4Q,18080
  ipex_llm/transformers/models/internvl.py,sha256=Vx0vENIEQLX2M6P398mw5TOhpks0U8xf8rtRQvy94go,8154
- ipex_llm/transformers/models/llama.py,sha256=n1JG1uElMB8t3Hpae94S6YTO_5q2N5BUAhb7mncvA6E,8560
- ipex_llm/transformers/models/minicpm.py,sha256=_eYBYafQxnuqKo9ENNkua73KU5goU2z-dkaLlF5uHnA,10147
- ipex_llm/transformers/models/minicpm3.py,sha256=FhNS6mi2rg7dSdF_QQGrao3g9EC6XLn1MTKd-kd0wF0,9191
+ ipex_llm/transformers/models/llama.py,sha256=NzpyQve_RC9ez1W-jWPLGZ80k_S1I5Rx5saAzCsDIoI,8558
+ ipex_llm/transformers/models/minicpm.py,sha256=eaPNVNrep0_xGoELhZd886ff0ceoKqB6cusdAhd52eE,10145
+ ipex_llm/transformers/models/minicpm3.py,sha256=11cYl8KM2hoIJNMAOZMxiwCu6dMhup9ric_OEn8-VrQ,9363
  ipex_llm/transformers/models/minicpmv.py,sha256=PP05b5iTnrMpiseCn8iJcxKJDnfq7WqXp9Mrch0kKZ0,9876
- ipex_llm/transformers/models/mistral.py,sha256=rE1GWQxXvF6aG-buPHDR13zeynDZEDIubPF4PiVhZbM,7451
- ipex_llm/transformers/models/mllama.py,sha256=ogpLmmN_OwcFUyjYB-oDC-l3uw8urFvUEc5edkjWHAk,10939
+ ipex_llm/transformers/models/mistral.py,sha256=uVhkdXaq15v1P3QY0emVsA7SxUbAWChHEEXYN-drjpQ,7449
+ ipex_llm/transformers/models/mllama.py,sha256=ZyRq9DTKsvk1AlRbr-z6ngjS3Sr_7YuGZ6-Yr1MBBAM,10937
  ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifsD907TQ,9540
  ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
- ipex_llm/transformers/models/phi3.py,sha256=jkiadJ85ToHpymY5GOM6orWlnx6LKN8_-v1MUcfGWPg,15159
+ ipex_llm/transformers/models/phi3.py,sha256=Fo6PlZ24Gdm7eeeZOTMm1Bfh3U6P4rvq7-_2FHvp0vE,15503
  ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
- ipex_llm/transformers/models/qwen.py,sha256=XIJ_bLzediBURWU-OOS3H6WBIGXQue6jDdUHJsAabwY,19391
- ipex_llm/transformers/models/qwen2.py,sha256=b49HO4GSudwGJ3n6uHVno1oo3DgRt3jOjtQnLOB3cdY,25530
- ipex_llm/transformers/models/qwen2_moe.py,sha256=EA_OYxYAEgrvi7VpDW192AJXG9Fwe2aBtOAZPkOAJk4,19350
- ipex_llm/transformers/models/qwen2_vl.py,sha256=jIm4yZSd751BkRqgj3wR1QBkDIh-TMCLAMM8SZ8n6Qo,13419
+ ipex_llm/transformers/models/qwen.py,sha256=A3WiVCzA7NLkcjp4zhFkZvKZzZWZlg0WFuVV_556TAI,19543
+ ipex_llm/transformers/models/qwen2.py,sha256=JLaY9ZT7A22oO0G8K-nvjvKQDaIrKA5o-jEHvk_y3eI,25604
+ ipex_llm/transformers/models/qwen2_moe.py,sha256=a0gYo-ngf8SxaEnBdZUJDnPS6Mkn_poDd8xqhx50icI,19516
+ ipex_llm/transformers/models/qwen2_vl.py,sha256=NrhxlaPj7W-HUBmKc3CSTwZy1lkoZ9qDaxM4GvE0kHs,13583
  ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
  ipex_llm/transformers/models/rwkv4.py,sha256=H4KMtxN0JA2ZTXnonHpsUUJ5xULemo-D1Jzl0ri_UY8,6123
  ipex_llm/transformers/models/rwkv5.py,sha256=OkRNj1pCAZg1z2Fw-I0DEnxLEdZyPeRSQ6msrkxLOCs,10710
  ipex_llm/transformers/models/sd.py,sha256=VvHV5u-0k2MgHu3NL9113hPj7DgfxqctuKzEEeNfRDU,5981
- ipex_llm/transformers/models/stablelm.py,sha256=RGQCYuQhYqtZ1j3RZkYi0_QvCRnUgUIPYxfBcLnElzg,6885
- ipex_llm/transformers/models/starcoder2.py,sha256=4P3mhRYf2Kreb1ESjrQGfy1puLMmZXgV35zf-Tksvao,6462
- ipex_llm/transformers/models/utils.py,sha256=isBCMMQP3j_opmda9XzD_dPk1ejvEXTztggbu1yIMSc,15439
- ipex_llm/transformers/models/yuan.py,sha256=1jRPebwAK2ENbyYokOmb4LSVo-szucWiygz9zTv-scs,7656
+ ipex_llm/transformers/models/stablelm.py,sha256=fj-XtOnR6kggnFUQTMPCOOzolkPztN06WAv8QW-XRnI,7054
+ ipex_llm/transformers/models/starcoder2.py,sha256=ONKvD7JCkRM0DI-R56x28QFBJ7CjD5hOZBQ_3WfOcNk,6626
+ ipex_llm/transformers/models/utils.py,sha256=ihbWS5kQK2KHDVPkMhgjik3nM8B2fWf-E-z4BWNUstk,15568
+ ipex_llm/transformers/models/yuan.py,sha256=JYAn_ZaSGK0NBJLEIxCACfAq084a66GFJkdd5NbpmMA,7732
  ipex_llm/transformers/npu_models/__init__.py,sha256=ulEUGLjaP48LCrVeury3UxLjXxKzRi0UpSG4bYu-7f8,585
  ipex_llm/transformers/npu_models/baichuan.py,sha256=fJtd7fBrttySghRUgfZTAdxLjsSNC-XL08HISsXigLE,4685
  ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLrV_4z89DIb4JjE3207M,45026
@@ -250,11 +250,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
  ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
  ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
  ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
- ipex_llm-2.2.0b20250108.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
- ipex_llm-2.2.0b20250108.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
- ipex_llm-2.2.0b20250108.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
- ipex_llm-2.2.0b20250108.dist-info/METADATA,sha256=NJp_uuPOJe8b5UQ8ASJbfzen2BGoc2DEM1ZInzr0X9E,12705
- ipex_llm-2.2.0b20250108.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
- ipex_llm-2.2.0b20250108.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
- ipex_llm-2.2.0b20250108.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
- ipex_llm-2.2.0b20250108.dist-info/RECORD,,
+ ipex_llm-2.2.0b20250109.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ ipex_llm-2.2.0b20250109.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ ipex_llm-2.2.0b20250109.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ ipex_llm-2.2.0b20250109.dist-info/METADATA,sha256=gPslIWSw_X5E5ULhQa8rOHeRo_UeBDXCAyPjBSPB-nU,12705
+ ipex_llm-2.2.0b20250109.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ ipex_llm-2.2.0b20250109.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ ipex_llm-2.2.0b20250109.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ ipex_llm-2.2.0b20250109.dist-info/RECORD,,