ipex-llm 2.2.0b20250107__py3-none-win_amd64.whl → 2.2.0b20250109__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +20 -50
- ipex_llm/transformers/loader.py +1 -1
- ipex_llm/transformers/low_bit_linear.py +10 -25
- ipex_llm/transformers/model.py +0 -7
- ipex_llm/transformers/models/baichuan.py +7 -36
- ipex_llm/transformers/models/bert.py +2 -13
- ipex_llm/transformers/models/chatglm2.py +8 -31
- ipex_llm/transformers/models/chatglm4.py +9 -4
- ipex_llm/transformers/models/chatglm4v.py +2 -1
- ipex_llm/transformers/models/common.py +3 -1
- ipex_llm/transformers/models/glm.py +4 -2
- ipex_llm/transformers/models/internlm.py +6 -3
- ipex_llm/transformers/models/llama.py +2 -2
- ipex_llm/transformers/models/minicpm.py +3 -2
- ipex_llm/transformers/models/minicpm3.py +3 -1
- ipex_llm/transformers/models/minicpmv.py +1 -0
- ipex_llm/transformers/models/mistral.py +1 -1
- ipex_llm/transformers/models/mllama.py +1 -1
- ipex_llm/transformers/models/phi3.py +6 -2
- ipex_llm/transformers/models/qwen.py +4 -2
- ipex_llm/transformers/models/qwen2.py +4 -3
- ipex_llm/transformers/models/qwen2_moe.py +4 -2
- ipex_llm/transformers/models/qwen2_vl.py +3 -1
- ipex_llm/transformers/models/stablelm.py +3 -1
- ipex_llm/transformers/models/starcoder2.py +3 -1
- ipex_llm/transformers/models/utils.py +10 -19
- ipex_llm/transformers/models/yuan.py +2 -1
- ipex_llm/transformers/speculative.py +2 -14
- ipex_llm/transformers/utils.py +2 -14
- ipex_llm/transformers/xpu_ops.py +25 -19
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +67 -68
- ipex_llm/transformers/models/gptj.py +0 -441
- {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250107.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0
@@ -87,7 +87,8 @@ def internlm_attention_forward(
     )
 
     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, hidden_states.device
@@ -171,7 +172,8 @@ def internlm2_attention_forward(
     )
 
     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+                                            self.num_heads, self.num_key_value_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, hidden_states.device
@@ -346,7 +348,8 @@ def internlm_xcomposser2_attention_forward(
         query_states, key_states, cos, sin, position_ids, "internlm")
 
     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+                                            self.num_heads, self.num_key_value_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
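The hunks above, and the matching ones in qwen.py, yuan.py and stablelm.py further down, all make the same mechanical change: use_quantize_kv_cache now receives the attention-head and KV-head counts explicitly instead of a pre-computed group size. A minimal sketch of the new call shape, using stand-in definitions rather than ipex_llm's real implementation:

import torch
import torch.nn as nn

# Stand-ins for illustration only: the real use_quantize_kv_cache lives in
# ipex_llm/transformers/models/utils.py and its policy is device-aware.
def use_quantize_kv_cache(linear: nn.Module, x: torch.Tensor,
                          num_heads: int, num_kv_heads: int) -> bool:
    return num_kv_heads >= 4  # placeholder policy

class FakeAttention(nn.Module):
    def __init__(self, hidden_size=4096, num_heads=32, num_kv_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.num_key_value_heads = num_kv_heads
        self.qkv_proj = nn.Linear(hidden_size, hidden_size)

attn = FakeAttention()
hidden_states = torch.randn(1, 16, 4096)
# 2.2.0b20250109-style call: head counts are passed alongside the projection and input.
use_quantize_kv = use_quantize_kv_cache(attn.qkv_proj, hidden_states,
                                        attn.num_heads, attn.num_key_value_heads)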
ipex_llm/transformers/models/llama.py
CHANGED
@@ -72,7 +72,7 @@ def llama_model_forward(
     use_cache = True if inputs.device.type == "xpu" else use_cache
     use_quantize_kv = use_quantize_kv_cache(
         self.layers[0].mlp.down_proj, inputs,
-        self.config.num_attention_heads//self.config.num_key_value_heads
+        self.config.num_attention_heads, self.config.num_key_value_heads
     )
     use_compresskv = should_use_compresskv(inputs, inputs.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)
@@ -116,7 +116,7 @@ def llama_model_forward(
 
 
 def merge_qkv(module: torch.nn.Module):
-
+    merge_qkv_base(module, LlamaAttention)
 
 
 def llama_attention_forward(
ipex_llm/transformers/models/minicpm.py
CHANGED
@@ -51,7 +51,8 @@ from transformers.cache_utils import Cache
 
 
 def merge_qkv(module: torch.nn.Module):
-
+    merge_qkv_base(module, "MiniCPMAttention")
+    merge_qkv_base(module, "MiniCPMSdpaAttention")
 
 
 def apply_residual_scale(module: torch.nn.Module):
@@ -158,7 +159,7 @@ def minicpm_model_forward_wrapper(origin_forward):
         # IPEX-LLM OPT: kv cache and quantize kv cache
         inputs = input_ids if input_ids is not None else inputs_embeds
         use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
-                                                self.config.num_attention_heads //
+                                                self.config.num_attention_heads,
                                                 self.config.num_key_value_heads)
         use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
             isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/minicpm3.py
CHANGED
@@ -66,7 +66,9 @@ def minicpm3_model_forward_wrapper(origin_forward):
         inputs = input_ids if input_ids is not None else inputs_embeds
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         use_cache = True if inputs.device.type == "xpu" else use_cache
-
+        num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                                num_heads, num_kv_heads)
         if use_cache:
             if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
                 past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/minicpmv.py
CHANGED
@@ -36,6 +36,7 @@ from transformers.generation.logits_process import RepetitionPenaltyLogitsProces
 # MiniCPM-V-2_5 and MiniCPM-V-2_6
 def merge_qkv(module: torch.nn.Module):
     merge_qkv_base(module, "SiglipAttention")
+    merge_qkv_base(module, "SiglipSdpaAttention")
     merge_qkv_base(module, "Idefics2VisionAttention")
 
 
ipex_llm/transformers/models/mistral.py
CHANGED
@@ -71,7 +71,7 @@ def mistral_model_forward(
     use_cache = use_cache if use_cache is not None else self.config.use_cache
     use_cache = use_cache or inputs.device.type == 'xpu'
     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
-                                            self.config.num_attention_heads //
+                                            self.config.num_attention_heads,
                                             self.config.num_key_value_heads)
     use_compress_kv = should_use_compresskv(inputs, inputs.size(1)) or \
         isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/mllama.py
CHANGED
@@ -113,7 +113,7 @@ def mllama_text_model_forward(
     use_cache = True if inputs.device.type == "xpu" else use_cache
     use_quantize_kv = use_quantize_kv_cache(
         self.layers[0].mlp.down_proj, inputs,
-        self.config.num_attention_heads//self.config.num_key_value_heads
+        self.config.num_attention_heads, self.config.num_key_value_heads
     )
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
ipex_llm/transformers/models/phi3.py
CHANGED
@@ -249,7 +249,9 @@ def phi3_model_forward_wrapper(origin_model_forward):
         # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         inputs = input_ids if input_ids is not None else inputs_embeds
-
+        num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                                num_heads, num_kv_heads)
         use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
             isinstance(past_key_values, DynamicCompressCache)
         if use_cache:
@@ -305,7 +307,9 @@ def phi3v_model_forward_wrapper(origin_model_forward):
     ):
         # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
+        num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+        use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids,
+                                                num_heads, num_kv_heads)
         if use_cache:
             if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
                 past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen.py
CHANGED
@@ -107,7 +107,8 @@ def qwen_attention_forward(
         query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)
 
     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
@@ -205,7 +206,8 @@ def qwen_attention_forward_registered(
         query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)
 
     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         past_key_value, key_states, value_states,
         kv_seq_len, use_quantize_kv, device
ipex_llm/transformers/models/qwen2.py
CHANGED
@@ -113,10 +113,10 @@ def qwen2_model_forward(
     # ipex-llm changes start
     # IPEX-LLM OPT: kv cache and quantize kv cache
     inputs = input_ids if input_ids is not None else inputs_embeds
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
     use_quantize_kv = (
         self.config.hidden_size != 3584  # disable quantize kv in specific model
-        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
-                                  self.config.num_attention_heads//self.config.num_key_value_heads)
+        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
     )
     use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)
@@ -305,10 +305,11 @@ def qwen2_model_forward_4_42(
 
     # ipex-llm changes start
     # IPEX-LLM OPT: kv cache and quantize kv cache
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
     use_quantize_kv = (
         self.config.hidden_size != 3584  # disable quantize kv in specific model
         and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
-                                  self.config.num_attention_heads//self.config.num_key_value_heads)
+                                  num_heads, num_kv_heads)
     )
     use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)
ipex_llm/transformers/models/qwen2_moe.py
CHANGED
@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
     return_dict: Optional[bool] = None,
 ):
     use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-
+    inputs = input_ids if input_ids is not None else inputs_embeds
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+    use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+                                            num_heads, num_kv_heads)
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/qwen2_vl.py
CHANGED
@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
     # IPEX-LLM OPT start: kv cache and quantize kv cache
     inputs = input_ids if input_ids is not None else inputs_embeds
     use_cache = True if inputs.device.type == "xpu" else use_cache
-
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+    use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                            num_heads, num_kv_heads)
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/stablelm.py
CHANGED
@@ -69,8 +69,10 @@ def stablelm_model_forward(
 ):
     # IPEX-LLM OPT: kv cache and quantize kv cache
     use_cache = use_cache if use_cache is not None else self.config.use_cache
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
     use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
-                       and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+                       and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+                                                 num_heads, num_kv_heads))
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/starcoder2.py
CHANGED
@@ -132,7 +132,9 @@ def model_forward(
     return_dict: Optional[bool] = None,
 ):
     use_cache = use_cache if use_cache is not None else self.config.use_cache
-
+    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+    use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+                                            num_heads, num_kv_heads)
     if use_cache:
         if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/utils.py
CHANGED
@@ -19,7 +19,7 @@ import torch
 import warnings
 from ipex_llm.utils.common import invalidInputError
 from ipex_llm.ggml.quantize import ggml_tensor_qtype
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
     FP6, ASYM_INT4
 
@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
     return new_cache_k, new_cache_v
 
 
-def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: int = 1) -> bool:
+def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+                          num_heads: int, num_kv_heads: int) -> bool:
     if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
         warnings.warn(
             "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
     else:
         device_name = get_xpu_device_name(x.device)
         return (
-
-
+            num_kv_heads >= 4
+            and (
+                device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+                or device_name in ["arc", "bmg"] and x.size(0) > 1
+            )
         )
 
 
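Read on its own, the condition added above is a small predicate over the device name, the GQA ratio and the batch size. The following self-contained restatement is illustrative only; it omits the BIGDL_QUANTIZE_KV_CACHE environment-variable branch earlier in the function and anything else not visible in this hunk:

def quantize_kv_heuristic(device_name: str, num_heads: int,
                          num_kv_heads: int, batch_size: int) -> bool:
    # Mirrors the expression introduced in 2.2.0b20250109: quantize the KV cache
    # only with at least 4 KV heads, and then only on "mtl"/"lnl"/"arl" devices
    # with a small GQA ratio, or on "arc"/"bmg" devices when batch size > 1.
    return (
        num_kv_heads >= 4
        and (
            (device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4)
            or (device_name in ["arc", "bmg"] and batch_size > 1)
        )
    )

# Example: a 32-head / 8-KV-head model on an Arc GPU with batch size 2.
assert quantize_kv_heuristic("arc", 32, 8, batch_size=2)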
@@ -168,7 +172,7 @@ def should_use_fuse_rope(hidden_states, position_ids, training):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
     if model_family in ["llama", "baichuan", "internlm", "aquila", "gpt_neox", "mistral",
-                        "gptj", "qwen2", "yuan", "stablelm", "qwen2_moe"]:
+                        "qwen2", "yuan", "stablelm", "qwen2_moe"]:
         # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
         cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
         sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
@@ -183,7 +187,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
         q_embed = (q * cos) + (rotate_half(q) * sin)
         k_embed = (k * cos) + (rotate_half(k) * sin)
         return q_embed, k_embed
-    elif model_family in ["gptj", "chatglm"]:
+    elif model_family in ["chatglm"]:
         q_embed = (q * cos) + (rotate_every_two(q) * sin)
         k_embed = (k * cos) + (rotate_every_two(k) * sin)
         return q_embed, k_embed
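For reference, the two application styles that this dispatch separates differ only in how elements of the head dimension are paired for rotation. A generic sketch, not ipex_llm's code, with the cos/sin handling simplified:

import torch

def rotate_half(x):
    # llama-style: pair the first half of the head dim with the second half.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def rotate_every_two(x):
    # chatglm/GPT-J-style: pair adjacent (even, odd) elements.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

q = torch.randn(1, 8, 16, 64)   # [batch, heads, seq_len, head_dim]
cos = torch.randn(16, 64)
sin = torch.randn(16, 64)
q_llama = q * cos + rotate_half(q) * sin         # "llama", "mistral", "qwen2", ...
q_chatglm = q * cos + rotate_every_two(q) * sin  # "chatglm"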
@@ -192,19 +196,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, model_family):
                           f"{model_family} is not supported.")
 
 
-def apply_ipex_rotate_every_two(q, k, cos, sin):
-    # ipex's apply_rotary_embedding_two_qk can change the origin storage,
-    # so q/k will get the result directly.
-    from ipex_llm.transformers.utils import get_ipex_version
-    if get_ipex_version() >= "2.1.10+xpu":
-        torch.ops.torch_ipex.apply_rotary_embedding_two_qk(
-            q, k, sin, cos, q, k
-        )
-    else:
-        torch.ops.torch_ipex.apply_rotary_embedding(q, sin, cos, q)
-        torch.ops.torch_ipex.apply_rotary_embedding(k, sin, cos, k)
-
-
 def is_enough_kv_cache_room_4_36(past_key_value, idx, seq_len=1):
     # to determinate if is enough kv cache room in transformers==4.36
     # seq_len for current seq len
ipex_llm/transformers/models/yuan.py
CHANGED
@@ -158,7 +158,8 @@ def yuan_attention_forward(
                                              "yuan")
 
     # IPEX-LLM OPT: kv cache and quantzie kv cache
-    use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+    use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+                                            self.num_heads, self.num_heads)
     key_states, value_states = update_past_key_value(
         None if past_key_value is None else (past_key_value[0], past_key_value[1]),
         key_states, value_states,
ipex_llm/transformers/speculative.py
CHANGED
@@ -432,8 +432,7 @@ def _check_and_extend_kv_cache(past_key_values, max_step_draft, kv_alloc_block_l
     from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \
         extend_kv_cache
     enough_kv_room = True
-    if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral",
-                          "gptj", "opt"]:
+    if model_type not in ["chatglm", "qwen", "baichuan", "llama", "mistral", "opt"]:
         return past_key_values, False
     cache_k = past_key_values[0][0]
     if model_type == "chatglm":
@@ -527,7 +526,7 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa
              v[:-(new_cache_size), :, :, :])
            for k, v in past_key_values
        ]
-    elif self.config.model_type in ["baichuan", "gptj"]:
+    elif self.config.model_type in ["baichuan"]:
        past_key_values = [
            (k[:, :, :-(new_cache_size), :],
             v[:, :, :-(new_cache_size), :])
@@ -796,13 +795,6 @@ def _non_cpu_ipex_verify(self, verify_input_ids, past_key_values, cur_attention_
                                    device=verify_input_ids.device)
        position_ids = position_ids.unsqueeze(0).repeat(1, 1) + past_key_value_len
        forward_args["position_ids"] = position_ids
-    elif self.config.model_type == "gptj":
-        past_length = past_key_values[0][0].size(2)
-        input_len = verify_input_ids.shape[1]
-        position_ids = torch.arange(past_length, input_len + past_length,
-                                    dtype=torch.long, device=verify_input_ids.device)
-        position_ids = position_ids.unsqueeze(0).view(-1, input_len)
-        forward_args["position_ids"] = position_ids
 
     return self(**forward_args)
 
@@ -971,10 +963,6 @@ def speculative_generate(self,
                    past_key_value_len = past_key_values[0][0].shape[0]
                    position_ids = torch.Tensor([[past_key_value_len + step_draft]]).long()
                    forward_args["position_ids"] = position_ids
-                elif self.config.model_type == "gptj":
-                    past_length = draft_past_key_values[0][0].size(2)
-                    position_ids = torch.Tensor([[past_length]]).long().to(self.device)
-                    forward_args["position_ids"] = position_ids
 
                 if _enable_ipex:
                     if any(keyword in self.config.model_type
ipex_llm/transformers/utils.py
CHANGED
@@ -154,24 +154,12 @@ def get_autocast_dtype(x):
                           f"Device {x.device} is not supported.")
 
 
-_ipex_version = None
-
-
-def get_ipex_version():
-
-    global _ipex_version
-    if _ipex_version is not None:
-        return _ipex_version
-
-    import intel_extension_for_pytorch as ipex
-    _ipex_version = ipex.__version__
-    return _ipex_version
-
-
 def get_xpu_device_name(device: torch.device):
     if device.type != "xpu":
         return device.type
     else:
+        # possiable device name:
+        # ["arc", "pvc", "mtl", "lnl", "bmg", "arl", "legacy", "unknown"]
         import xe_linear
         return xe_linear.get_xpu_device_name(device)
 
ipex_llm/transformers/xpu_ops.py
CHANGED
@@ -20,9 +20,9 @@ import xe_batch
 import xe_addons
 
 
-@torch.library.register_fake("ipex_llm::forward_new")
-def _(x, weight, qtype, input_size):
-
+# @torch.library.register_fake("ipex_llm::forward_new")
+# def _(x, weight, qtype, input_size):
+#     return ???
 
 
 # @torch.library.register_fake("ipex_llm::dequant")
@@ -32,32 +32,38 @@ def _(x, weight, qtype, input_size):
 
 @torch.library.register_fake("ipex_llm::mlp_forward_xpu")
 def _(x, weight1, weight2, batch_size, state_size, output_size, act_type, qtype):
-    return torch.
+    return torch.empty([batch_size, output_size],
+                       dtype=x.dtype, device=x.device)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::rwkv_linear_attention_v4")
+def _(time_decay, time_first, key, value, num_state, den_state, max_state):
+    return torch.empty_like(key)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::rwkv_linear_attention_v5")
+def _(time_decay, time_first, receptance, key, value, state):
+    bsz, n_heads, seq_len, head_dim = key.shape
+    return torch.empty([bsz, seq_len, n_heads, head_dim],
+                       dtype=key.dtype, device=key.device)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::rwkv_time_shift")
+def _(hidden, shifted, mix):
+    bsz, seq_len, hidden_size = hidden.shape
+    return torch.empty([mix.size(0), bsz, seq_len, hidden_size],
+                       dtype=hidden.dtype, device=hidden.device)
 
 
-
-
-
+@torch.library.register_fake("ipex_llm::dequantize_rows")
+def _(x, weight, qtype, state_size, output_size):
+    return torch.empty([x.size(0), x.size(1), state_size],
+                       dtype=torch.float, device=weight.device)
 
 
-@torch.library.register_fake("ipex_llm::batch_forward")
-def _(x, weight, qtype):
-
+# @torch.library.register_fake("ipex_llm::batch_forward")
+# def _(x, weight, qtype):
+#     return ???
 
 
 @torch.library.register_fake("ipex_llm::sdp")
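For context on the API used throughout this file: torch.library.register_fake (the successor to impl_abstract in recent PyTorch) attaches a Python "fake"/meta implementation to a custom operator, so shape and dtype propagation, and hence torch.compile tracing, work without executing the real kernel. The snippet below is a self-contained illustration with a made-up operator, not part of ipex_llm:

import torch

# Define a toy custom op; "mylib::scaled_add" is hypothetical.
torch.library.define("mylib::scaled_add", "(Tensor x, Tensor y, float alpha) -> Tensor")

@torch.library.impl("mylib::scaled_add", "CPU")
def _scaled_add_cpu(x, y, alpha):
    # Real kernel: only runs when the op is actually executed.
    return x + alpha * y

@torch.library.register_fake("mylib::scaled_add")
def _(x, y, alpha):
    # Fake/meta kernel: describes only the output's shape, dtype and device.
    return torch.empty_like(x)

out = torch.ops.mylib.scaled_add(torch.ones(3), torch.full((3,), 2.0), 0.5)
print(out)  # tensor([2., 2., 2.])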
{ipex_llm-2.2.0b20250107.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250107
+Version: 2.2.0b20250109
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250107 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250107 ; extra == 'cpp-arl'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250107 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250109 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250107 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250107 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250107 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250107 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.0b20250107 ; extra == 'xpu-2-6'
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250109 ; extra == 'xpu-2-6'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
 Provides-Extra: xpu-arc
 Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250107 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'