ipex-llm 2.2.0b20250107__py3-none-win_amd64.whl → 2.2.0b20250109__py3-none-win_amd64.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries; it is provided for informational purposes only.
Files changed (68)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +20 -50
  31. ipex_llm/transformers/loader.py +1 -1
  32. ipex_llm/transformers/low_bit_linear.py +10 -25
  33. ipex_llm/transformers/model.py +0 -7
  34. ipex_llm/transformers/models/baichuan.py +7 -36
  35. ipex_llm/transformers/models/bert.py +2 -13
  36. ipex_llm/transformers/models/chatglm2.py +8 -31
  37. ipex_llm/transformers/models/chatglm4.py +9 -4
  38. ipex_llm/transformers/models/chatglm4v.py +2 -1
  39. ipex_llm/transformers/models/common.py +3 -1
  40. ipex_llm/transformers/models/glm.py +4 -2
  41. ipex_llm/transformers/models/internlm.py +6 -3
  42. ipex_llm/transformers/models/llama.py +2 -2
  43. ipex_llm/transformers/models/minicpm.py +3 -2
  44. ipex_llm/transformers/models/minicpm3.py +3 -1
  45. ipex_llm/transformers/models/minicpmv.py +1 -0
  46. ipex_llm/transformers/models/mistral.py +1 -1
  47. ipex_llm/transformers/models/mllama.py +1 -1
  48. ipex_llm/transformers/models/phi3.py +6 -2
  49. ipex_llm/transformers/models/qwen.py +4 -2
  50. ipex_llm/transformers/models/qwen2.py +4 -3
  51. ipex_llm/transformers/models/qwen2_moe.py +4 -2
  52. ipex_llm/transformers/models/qwen2_vl.py +3 -1
  53. ipex_llm/transformers/models/stablelm.py +3 -1
  54. ipex_llm/transformers/models/starcoder2.py +3 -1
  55. ipex_llm/transformers/models/utils.py +10 -19
  56. ipex_llm/transformers/models/yuan.py +2 -1
  57. ipex_llm/transformers/speculative.py +2 -14
  58. ipex_llm/transformers/utils.py +2 -14
  59. ipex_llm/transformers/xpu_ops.py +25 -19
  60. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
  61. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +67 -68
  62. ipex_llm/transformers/models/gptj.py +0 -441
  63. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init.bat +0 -0
  64. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat.ps1 +0 -0
  65. {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli.ps1 +0 -0
  66. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
  67. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
  68. {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/internlm.py
@@ -87,7 +87,8 @@ def internlm_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -171,7 +172,8 @@ def internlm2_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -346,7 +348,8 @@ def internlm_xcomposser2_attention_forward(
  query_states, key_states, cos, sin, position_ids, "internlm")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/llama.py
@@ -72,7 +72,7 @@ def llama_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  use_compresskv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
@@ -116,7 +116,7 @@ def llama_model_forward(


  def merge_qkv(module: torch.nn.Module):
- return merge_qkv_base(module, LlamaAttention)
+ merge_qkv_base(module, LlamaAttention)


  def llama_attention_forward(
ipex_llm/transformers/models/minicpm.py
@@ -51,7 +51,8 @@ from transformers.cache_utils import Cache


  def merge_qkv(module: torch.nn.Module):
- return merge_qkv_base(module, "MiniCPMAttention")
+ merge_qkv_base(module, "MiniCPMAttention")
+ merge_qkv_base(module, "MiniCPMSdpaAttention")


  def apply_residual_scale(module: torch.nn.Module):
@@ -158,7 +159,7 @@ def minicpm_model_forward_wrapper(origin_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/minicpm3.py
@@ -66,7 +66,9 @@ def minicpm3_model_forward_wrapper(origin_forward):
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/minicpmv.py
@@ -36,6 +36,7 @@ from transformers.generation.logits_process import RepetitionPenaltyLogitsProces
  # MiniCPM-V-2_5 and MiniCPM-V-2_6
  def merge_qkv(module: torch.nn.Module):
  merge_qkv_base(module, "SiglipAttention")
+ merge_qkv_base(module, "SiglipSdpaAttention")
  merge_qkv_base(module, "Idefics2VisionAttention")

ipex_llm/transformers/models/mistral.py
@@ -71,7 +71,7 @@ def mistral_model_forward(
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.size(1)) or \
  isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/mllama.py
@@ -113,7 +113,7 @@ def mllama_text_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
ipex_llm/transformers/models/phi3.py
@@ -249,7 +249,9 @@ def phi3_model_forward_wrapper(origin_model_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  inputs = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
  if use_cache:
@@ -305,7 +307,9 @@ def phi3v_model_forward_wrapper(origin_model_forward):
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen.py
@@ -107,7 +107,8 @@ def qwen_attention_forward(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
@@ -205,7 +206,8 @@ def qwen_attention_forward_registered(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/qwen2.py
@@ -113,10 +113,10 @@ def qwen2_model_forward(
  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
@@ -305,10 +305,11 @@ def qwen2_model_forward_4_42(

  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
  and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/qwen2_moe.py
@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- input = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input)
+ inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen2_vl.py
@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
  # IPEX-LLM OPT start: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/stablelm.py
@@ -69,8 +69,10 @@ def stablelm_model_forward(
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  use_cache = use_cache if use_cache is not None else self.config.use_cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+ num_heads, num_kv_heads))
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/starcoder2.py
@@ -132,7 +132,9 @@ def model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/utils.py
@@ -19,7 +19,7 @@ import torch
  import warnings
  from ipex_llm.utils.common import invalidInputError
  from ipex_llm.ggml.quantize import ggml_tensor_qtype
- from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_name
+ from ipex_llm.transformers.utils import get_xpu_device_name
  from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
  FP6, ASYM_INT4

@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
  return new_cache_k, new_cache_v


- def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: int = 1) -> bool:
+ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+ num_heads: int, num_kv_heads: int) -> bool:
  if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
  warnings.warn(
  "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
  else:
  device_name = get_xpu_device_name(x.device)
  return (
- device_name in ["mtl", "lnl", "arl"] and kv_group == 1
- or device_name in ["arc", "bmg"] and x.size(0) > 1
+ num_kv_heads >= 4
+ and (
+ device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+ or device_name in ["arc", "bmg"] and x.size(0) > 1
+ )
  )

@@ -168,7 +172,7 @@ def should_use_fuse_rope(hidden_states, position_ids, training):

  def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
  if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox", "mistral",
- "mixtral", "qwen2", "yuan", "stablelm", "qwen2_moe"]:
+ "qwen2", "yuan", "stablelm", "qwen2_moe"]:
  # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
  cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
  sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
@@ -183,7 +187,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
  q_embed = (q * cos) + (rotate_half(q) * sin)
  k_embed = (k * cos) + (rotate_half(k) * sin)
  return q_embed, k_embed
- elif model_family in ["gptj", "chatglm"]:
+ elif model_family in ["chatglm"]:
  q_embed = (q * cos) + (rotate_every_two(q) * sin)
  k_embed = (k * cos) + (rotate_every_two(k) * sin)
  return q_embed, k_embed
@@ -192,19 +196,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
  f"{model_family} is not supported.")


- def apply_ipex_rotate_every_two(q, k, cos, sin):
- # ipex's apply_rotary_embedding_two_qk can change the origin storage,
- # so q/k will get the result directly.
- from ipex_llm.transformers.utils import get_ipex_version
- if get_ipex_version() >= "2.1.10+xpu":
- torch.ops.torch_ipex.apply_rotary_embedding_two_qk(
- q, k, sin, cos, q, k
- )
- else:
- torch.ops.torch_ipex.apply_rotary_embedding(q, sin, cos, q)
- torch.ops.torch_ipex.apply_rotary_embedding(k, sin, cos, k)
-
-
  def is_enough_kv_cache_room_4_36(past_key_value, idx, seq_len=1):
  # to determinate if is enough kv cache room in transformers==4.36
  # seq_len for current seq len
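Note: the `use_quantize_kv_cache` hunks above change the helper's contract. It now takes the raw attention-head and KV-head counts instead of a precomputed `kv_group` ratio, which is what all of the per-model call-site updates earlier in this diff pass to it. Read together, the new branch amounts to roughly the following decision rule (a simplified sketch that mirrors the diff; the standalone `should_quantize_kv` name is only for illustration, `batch_size` stands in for `x.size(0)`, and the `BIGDL_QUANTIZE_KV_CACHE` environment-variable override handled earlier in the function is omitted):

def should_quantize_kv(device_name: str, batch_size: int,
                       num_heads: int, num_kv_heads: int) -> bool:
    # Quantized (FP8) KV cache is only considered when the model has at least
    # four KV heads; on integrated GPUs (mtl/lnl/arl) the GQA ratio must also
    # be at most 4, while on discrete GPUs (arc/bmg) it is used for batched input.
    return (
        num_kv_heads >= 4
        and (
            (device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4)
            or (device_name in ["arc", "bmg"] and batch_size > 1)
        )
    )

For example, a model with 32 attention heads and 8 KV heads (GQA ratio 4) now qualifies on an MTL iGPU, whereas the previous `kv_group == 1` test only admitted models without grouped-query attention.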
ipex_llm/transformers/models/yuan.py
@@ -158,7 +158,8 @@ def yuan_attention_forward(
  "yuan")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  None if past_key_value is None else (past_key_value[0], past_key_value[1]),
  key_states, value_states,
ipex_llm/transformers/speculative.py
@@ -432,8 +432,7 @@ def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_l
  from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \
  extend_kv_cache
  enough_kv_room = True
- if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral",
- "gptj", "opt"]:
+ if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", "opt"]:
  return past_key_values, False
  cache_k = past_key_values[0][0]
  if model_type == "chatglm":
@@ -527,7 +526,7 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa
  v[:-(new_cache_size), :, :, :])
  for k, v in past_key_values
  ]
- elif self.config.model_type in ["baichuan", "gptj"]:
+ elif self.config.model_type in ["baichuan"]:
  past_key_values = [
  (k[:, :, :-(new_cache_size), :],
  v[:, :, :-(new_cache_size), :])
@@ -796,13 +795,6 @@ def _non_cpu_ipex_verify(self, verify_input_ids, past_key_values, cur_attention_
  device=verify_input_ids.device)
  position_ids = position_ids.unsqueeze(0).repeat(1, 1) + past_key_value_len
  forward_args["position_ids"] = position_ids
- elif self.config.model_type == "gptj":
- past_length = past_key_values[0][0].size(2)
- input_len = verify_input_ids.shape[1]
- position_ids = torch.arange(past_length, input_len + past_length,
- dtype=torch.long, device=verify_input_ids.device)
- position_ids = position_ids.unsqueeze(0).view(-1, input_len)
- forward_args["position_ids"] = position_ids

  return self(**forward_args)

@@ -971,10 +963,6 @@ def speculative_generate(self,
  past_key_value_len = past_key_values[0][0].shape[0]
  position_ids = torch.Tensor([[past_key_value_len + step_draft]]).long()
  forward_args["position_ids"] = position_ids
- elif self.config.model_type == "gptj":
- past_length = draft_past_key_values[0][0].size(2)
- position_ids = torch.Tensor([[past_length]]).long().to(self.device)
- forward_args["position_ids"] = position_ids

  if _enable_ipex:
  if any(keyword in self.config.model_type
ipex_llm/transformers/utils.py
@@ -154,24 +154,12 @@ def get_autocast_dtype(x):
  f"Device {x.device} is not supported.")


- _ipex_version = None
-
-
- def get_ipex_version():
-
- global _ipex_version
- if _ipex_version is not None:
- return _ipex_version
-
- import intel_extension_for_pytorch as ipex
- _ipex_version = ipex.__version__
- return _ipex_version
-
-
  def get_xpu_device_name(device: torch.device):
  if device.type != "xpu":
  return device.type
  else:
+ # possiable device name:
+ # ["arc", "pvc", "mtl", "lnl", "bmg", "arl", "legacy", "unknown"]
  import xe_linear
  return xe_linear.get_xpu_device_name(device)

ipex_llm/transformers/xpu_ops.py
@@ -20,9 +20,9 @@ import xe_batch
  import xe_addons


- @torch.library.register_fake("ipex_llm::forward_new")
- def _(x, weight, qtype, input_size):
- return torch.empty_like(x)
+ # @torch.library.register_fake("ipex_llm::forward_new")
+ # def _(x, weight, qtype, input_size):
+ # return ???


  # @torch.library.register_fake("ipex_llm::dequant")
@@ -32,32 +32,38 @@ def _(x, weight, qtype, input_size):

  @torch.library.register_fake("ipex_llm::mlp_forward_xpu")
  def _(x, weight1, weight2, batch_size, state_size, output_size, act_type, qtype):
- return torch.empty_like(x)
+ return torch.empty([batch_size, output_size],
+ dtype=x.dtype, device=x.device)


- # @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v4")
- # def _(time_decay, time_first, key, value, num_state, den_state, max_state)
- # return ???
+ @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v4")
+ def _(time_decay, time_first, key, value, num_state, den_state, max_state):
+ return torch.empty_like(key)


- # @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v5")
- # def _(time_decay, time_first, receptance, key, value, state)
- # return ???
+ @torch.library.register_fake("ipex_llm::rwkv_linear_attention_v5")
+ def _(time_decay, time_first, receptance, key, value, state):
+ bsz, n_heads, seq_len, head_dim = key.shape
+ return torch.empty([bsz, seq_len, n_heads, head_dim],
+ dtype=key.dtype, device=key.device)


- # @torch.library.register_fake("ipex_llm::rwkv_time_shift")
- # def _(hidden, shifted, mix):
- # return ???
+ @torch.library.register_fake("ipex_llm::rwkv_time_shift")
+ def _(hidden, shifted, mix):
+ bsz, seq_len, hidden_size = hidden.shape
+ return torch.empty([mix.size(0), bsz, seq_len, hidden_size],
+ dtype=hidden.dtype, device=hidden.device)


- # @torch.library.register_fake("ipex_llm::dequantize_rows")
- # def _(x, weight, qtype, state_size, output_size):
- # return ???
+ @torch.library.register_fake("ipex_llm::dequantize_rows")
+ def _(x, weight, qtype, state_size, output_size):
+ return torch.empty([x.size(0), x.size(1), state_size],
+ dtype=torch.float, device=weight.device)


- @torch.library.register_fake("ipex_llm::batch_forward")
- def _(x, weight, qtype):
- return torch.empty_like(x)
+ # @torch.library.register_fake("ipex_llm::batch_forward")
+ # def _(x, weight, qtype):
+ # return ???


  @torch.library.register_fake("ipex_llm::sdp")
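For context on the xpu_ops.py hunks above: `torch.library.register_fake` (formerly `torch.library.impl_abstract`) registers a shape-propagation "fake" kernel for a custom operator, which torch.compile and meta-device tracing use to reason about the op without executing the real XPU kernel. Such a kernel only needs to return an empty tensor with the correct shape, dtype and device, which is why the placeholder `torch.empty_like(x)` stubs are replaced with explicitly shaped outputs. A minimal, self-contained sketch using a made-up operator (`mylib::row_sum` is not part of ipex-llm) on a PyTorch build that provides `register_fake`:

import torch

# Hypothetical custom op that sums each row of a 2-D tensor; a real backend
# kernel would be registered separately (e.g. from C++/SYCL).
torch.library.define("mylib::row_sum", "(Tensor x) -> Tensor")

@torch.library.register_fake("mylib::row_sum")
def _(x):
    # No computation here: only describe the output's shape/dtype/device so
    # the compiler can trace through calls to torch.ops.mylib.row_sum.
    return torch.empty([x.size(0)], dtype=x.dtype, device=x.device)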
{ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250107
+ Version: 2.2.0b20250109
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250107 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
  Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250107 ; extra == 'cpp-arl'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp-arl'
  Requires-Dist: setuptools ; extra == 'cpp-arl'
  Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
  Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250107 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250109 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250107 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250107 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250107 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250107 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250109 ; extra == 'xpu-2-6'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
  Provides-Extra: xpu-arc
  Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'