ipex-llm 2.2.0b20250109__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250111__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/transformers/convert.py +4 -9
- ipex_llm/transformers/convert_ipex.py +8 -1
- ipex_llm/transformers/low_bit_linear.py +5 -5
- ipex_llm/transformers/models/baichuan.py +1 -2
- ipex_llm/transformers/models/internlm.py +0 -15
- ipex_llm/transformers/models/phi3.py +2 -19
- ipex_llm/transformers/models/qwen2.py +25 -310
- ipex_llm/transformers/models/utils.py +0 -20
- ipex_llm/transformers/npu_model.py +7 -3
- {ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/RECORD +17 -17
- {ipex_llm-2.2.0b20250109.data → ipex_llm-2.2.0b20250111.data}/scripts/ipex-llm-init +0 -0
- {ipex_llm-2.2.0b20250109.data → ipex_llm-2.2.0b20250111.data}/scripts/llm-chat +0 -0
- {ipex_llm-2.2.0b20250109.data → ipex_llm-2.2.0b20250111.data}/scripts/llm-cli +0 -0
- {ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/top_level.txt +0 -0
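The same comparison can be reproduced locally, since a wheel is just a zip archive. The sketch below is not part of the published diff; it assumes both wheel files have already been downloaded into the working directory, and the file names are placeholders.

import difflib
import zipfile

# Placeholder file names; adjust to the wheels you actually downloaded.
OLD_WHL = "ipex_llm-2.2.0b20250109-py3-none-manylinux2010_x86_64.whl"
NEW_WHL = "ipex_llm-2.2.0b20250111-py3-none-manylinux2010_x86_64.whl"

def read_member(whl_path, member):
    # Wheels are zip archives, so any packaged file can be read directly.
    with zipfile.ZipFile(whl_path) as whl:
        return whl.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

old_lines = read_member(OLD_WHL, "ipex_llm/transformers/convert.py")
new_lines = read_member(NEW_WHL, "ipex_llm/transformers/convert.py")
print("".join(difflib.unified_diff(old_lines, new_lines,
                                   fromfile="old/convert.py", tofile="new/convert.py")))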
ipex_llm/transformers/convert.py
CHANGED
@@ -1590,6 +1590,9 @@ def _optimize_post(model):
         convert_forward(model,
                         module.Qwen2ForCausalLM,
                         qwen2_causal_lm_forward)
+        convert_forward(model,
+                        module.Qwen2Model,
+                        qwen2_model_forward)
         convert_forward(model,
                         module.Qwen2RMSNorm,
                         rms_norm_forward)
@@ -1602,12 +1605,6 @@ def _optimize_post(model):
         convert_forward(model,
                         module.Qwen2SdpaAttention,
                         qwen2_attention_forward)
-        if version.parse(trans_version) >= version.parse("4.42"):
-            from ipex_llm.transformers.models.qwen2 import qwen2_model_forward_4_42
-            convert_forward(model, module.Qwen2Model, qwen2_model_forward_4_42)
-        else:
-            from ipex_llm.transformers.models.qwen2 import qwen2_model_forward
-            convert_forward(model, module.Qwen2Model, qwen2_model_forward)
     elif model.config.model_type == "qwen2_moe":
         # for Qwen1.5-MOE-A2.7B
         modeling_module_name = model.__class__.__module__
@@ -1819,9 +1816,7 @@ def _optimize_post(model):
         from ipex_llm.transformers.models.phi3 import attention_forward
         convert_forward(model, module.Phi3Attention, attention_forward)
         convert_forward(model, module.Phi3SdpaAttention, attention_forward)
-
-        convert_forward(model, module.Phi3MLP, mlp_forward)
-        from ipex_llm.transformers.models.common import rms_norm_forward
+        convert_forward(model, module.Phi3MLP, mlp_silu_forward)
         convert_forward(model, module.Phi3RMSNorm, rms_norm_forward)
         if model.config.model_type == "phi3":
             from ipex_llm.transformers.models.phi3 import phi3_model_forward_wrapper
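The hunks above change which forward implementations _optimize_post registers for the Qwen2 and Phi-3 module classes. As a rough mental model only (an illustrative sketch, not ipex-llm's actual implementation), a convert_forward-style helper rebinds the forward method on every module of the target class:

import types
import torch.nn as nn

def convert_forward(model: nn.Module, target_class: type, new_forward) -> None:
    # Sketch: rebind `forward` on every submodule whose class matches
    # `target_class`. Only the call signature mirrors the hunks above;
    # the real ipex-llm helper may do more.
    for module in model.modules():
        if module.__class__ == target_class:
            module.forward = types.MethodType(new_forward, module)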
ipex_llm/transformers/convert_ipex.py
CHANGED
@@ -52,7 +52,14 @@ import os


 def _ipex_optimize_rmsnorm(_model, supported_classes, is_tpp=False, is_woq=False):
-
+    try:
+        # old version use name `_IPEXRMSNorm`
+        from intel_extension_for_pytorch.transformers.models.cpu.fusions.mha_fusion \
+            import _IPEXRMSNorm
+    except ImportError:
+        # new version use name `_IPEXRMSNormCPU`
+        from intel_extension_for_pytorch.transformers.models.cpu.fusions.mha_fusion \
+            import _IPEXRMSNormCPU as _IPEXRMSNorm
     for supported_class in supported_classes:
         lowering_class_cpu(
             _model,
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -47,7 +47,7 @@ import os
 import torch
 import torch.distributed
 import torch.nn.functional as F
-from torch import Tensor,
+from torch import Tensor, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
@@ -294,10 +294,10 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
     if hard_condition:
         return (
            batch_size > 1
-            or (
-            or (
-            or (
-            or (
+            or (device_name in ["arc"] and qtype in [SYM_INT8, FP4])
+            or (device_name in ["arc", "mtl"] and qtype in [FP8E4])
+            or (device_name in ["lnl"] and qtype in [SYM_INT4] and x.shape[1] % 512 == 0)
+            or (device_name in ["bmg"] and qtype in [SYM_INT4, FP8E5])
        )
    return False

ipex_llm/transformers/models/baichuan.py
CHANGED
@@ -30,8 +30,7 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp
 from ipex_llm.transformers.models.utils import update_past_key_value
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
 from ipex_llm.transformers.models.utils import use_sdp
-from ipex_llm.transformers.models.utils import apply_rotary_pos_emb
-from ipex_llm.transformers.models.utils import mlp_fusion_check
+from ipex_llm.transformers.models.utils import apply_rotary_pos_emb
 from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36
 from ipex_llm.transformers.kv import DynamicCompressFp8Cache, DynamicCompressCache
 import warnings
ipex_llm/transformers/models/internlm.py
CHANGED
@@ -113,21 +113,6 @@ def internlm_attention_forward(
     return attn_output, attn_weights, past_key_value


-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
-    The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to
-    (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads,
-                                                           n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
 def internlm2_attention_forward(
     self,
     hidden_states: torch.Tensor,
ipex_llm/transformers/models/phi3.py
CHANGED
@@ -39,7 +39,6 @@ import warnings
 from ipex_llm.transformers.models.common import attention_softmax
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
 from ipex_llm.transformers.models.utils import should_use_fuse_rope, rotate_half
-from ipex_llm.transformers.models.utils import mlp_fusion_check, SILU
 from ipex_llm.transformers.models.utils import use_sdp, use_sdp_causal
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache
 from ipex_llm.transformers.models.utils import should_use_compresskv, is_enough_kv_cache_room_4_36
@@ -213,24 +212,8 @@ def split_mlp(module: torch.nn.Module):

     del module.gate_up_proj

-
-
-        self,
-        hidden_states: torch.FloatTensor
-    ) -> torch.FloatTensor:
-        x_2d = hidden_states.view(-1, hidden_states.shape[-1])
-        qtype = getattr(self.gate_proj, "qtype", None)
-        if mlp_fusion_check(x_2d, qtype, self.training):
-            x_2d = x_2d.contiguous()
-            import xe_linear
-            return self.down_proj(xe_linear.mlp_forward_xpu(
-                x_2d, self.gate_proj.weight.data, self.up_proj.weight.data,
-                x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_features,
-                SILU, qtype
-            ))
-        return self.down_proj(
-            self.activation_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)
-        )
+    # rename activation function
+    module.act_fn = module.activation_fn


 def phi3_model_forward_wrapper(origin_model_forward):
ipex_llm/transformers/models/qwen2.py
CHANGED
@@ -51,217 +51,14 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, \
     should_use_compresskv, is_enough_kv_cache_room_4_36
 from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \
     DynamicCompressCache, DynamicCompressFp8Cache
-from ipex_llm.utils.common import invalidInputError

-from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP
+from transformers.models.qwen2.modeling_qwen2 import Qwen2Model, Qwen2Attention, Qwen2MLP
 from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.cache_utils import Cache
-from transformers import logging
-
-
-logger = logging.get_logger(__name__)


 def qwen2_model_forward(
-    self,
-    input_ids: torch.LongTensor = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None, # for transformers >= 4.42
-) -> Union[Tuple, BaseModelOutputWithPast]:
-    output_attentions = (
-        output_attentions if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-    # retrieve input_ids and inputs_embeds
-    if input_ids is not None and inputs_embeds is not None:
-        invalidInputError(False,
-                          "You cannot specify both input_ids and inputs_embeds at the same time")
-    elif input_ids is not None:
-        batch_size, seq_length = input_ids.shape
-    elif inputs_embeds is not None:
-        batch_size, seq_length, _ = inputs_embeds.shape
-    else:
-        invalidInputError(False,
-                          "You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-    if self.gradient_checkpointing and self.training:
-        if use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. "
-                "Setting `use_cache=False`..."
-            )
-            use_cache = False
-
-    past_key_values_length = 0
-
-    # ipex-llm changes start
-    # IPEX-LLM OPT: kv cache and quantize kv cache
-    inputs = input_ids if input_ids is not None else inputs_embeds
-    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
-    use_quantize_kv = (
-        self.config.hidden_size != 3584  # disable quantize kv in specific model
-        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
-    )
-    use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
-        isinstance(past_key_values, DynamicCompressCache)
-
-    if use_cache:
-        if use_compress_kv and not isinstance(past_key_values, DynamicCompressCache):
-            if use_quantize_kv:
-                past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values)
-            else:
-                past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values)
-        elif use_quantize_kv and not use_compress_kv and not isinstance(past_key_values,
-                                                                        DynamicFp8Cache):
-            past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
-        if not use_quantize_kv and not use_compress_kv and not isinstance(past_key_values,
-                                                                          DynamicNormalCache):
-            past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
-        past_key_values_length = past_key_values.get_usable_length(seq_length)
-    # ipex-llm changes end
-
-    if position_ids is None:
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-        position_ids = torch.arange(
-            past_key_values_length, seq_length + past_key_values_length,
-            dtype=torch.long, device=device
-        )
-        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-    else:
-        position_ids = position_ids.view(-1, seq_length).long()
-
-    if inputs_embeds is None:
-        inputs_embeds = self.embed_tokens(input_ids)
-
-    flash_attn_2 = self._attn_implementation == "flash_attention_2"
-    if attention_mask is not None and flash_attn_2 and use_cache:
-
-        is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-        if is_padding_right:
-            invalidInputError(
-                False,
-                "You are attempting to perform batched generation with padding_side='right'"
-                " this may lead to unexpected behaviour for Flash Attention version of Qwen2."
-                " Make sure to call `tokenizer.padding_side = 'left'` before tokenizing "
-                "the input. "
-            )
-
-    from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa
-    from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask
-
-    # ipex-llm changes start: don't generate `attention_mask` in decode phase
-    if seq_length == 1:
-        attention_mask = None
-    # ipex-llm changes end
-    elif self._attn_implementation == "flash_attention_2":
-        # 2d mask is passed through the layers
-        attention_mask = attention_mask if (attention_mask is not None and
-                                            0 in attention_mask) else None
-    elif self._attn_implementation == "sdpa" and not output_attentions:
-        # output_attentions=True can not be supported when using SDPA, and we fall back on
-        # the manual implementation that requires a 4D causal mask in all cases.
-        attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-            attention_mask,
-            (batch_size, seq_length),
-            inputs_embeds,
-            past_key_values_length,
-        )
-    else:
-        # 4d mask is passed through the layers
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask,
-            (batch_size, seq_length),
-            inputs_embeds,
-            past_key_values_length,
-            sliding_window=self.config.sliding_window,
-        )
-
-    hidden_states = inputs_embeds
-
-    # decoder layers
-    all_hidden_states = () if output_hidden_states else None
-    all_self_attns = () if output_attentions else None
-    next_decoder_cache = None
-
-    for decoder_layer in self.layers:
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        if self.gradient_checkpointing and self.training:
-            layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
-                hidden_states,
-                attention_mask,
-                position_ids,
-                past_key_values,
-                output_attentions,
-                use_cache,
-            )
-        else:
-            # ipex-llm changes
-            curr_device = decoder_layer.input_layernorm.weight.device
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(curr_device)
-            if position_ids is not None:
-                position_ids = position_ids.to(curr_device)
-            # ipex-llm changes end
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
-
-        hidden_states = layer_outputs[0]
-
-        if use_cache:
-            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-        if output_attentions:
-            all_self_attns += (layer_outputs[1],)
-
-    hidden_states = self.norm(hidden_states)
-
-    # add hidden states from the last decoder layer
-    if output_hidden_states:
-        all_hidden_states += (hidden_states,)
-
-    # ipex-llm changes start: remove `to_legacy_cache`
-    next_cache = None
-    if use_cache:
-        next_cache = next_decoder_cache
-    # ipex-llm changes end
-
-    if not return_dict:
-        return tuple(v for v in [hidden_states, next_cache,
-                                 all_hidden_states, all_self_attns] if v is not None)
-    return BaseModelOutputWithPast(
-        last_hidden_state=hidden_states,
-        past_key_values=next_cache,
-        hidden_states=all_hidden_states,
-        attentions=all_self_attns,
-    )
-
-
-def qwen2_model_forward_4_42(
     self,
     input_ids: torch.LongTensor = None,
     attention_mask: Optional[torch.Tensor] = None,
@@ -274,44 +71,17 @@ def qwen2_model_forward_4_42(
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
 ) -> Union[Tuple, BaseModelOutputWithPast]:
-
-
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
+    # IPEX-LLM OPT start: kv cache and quantize kv cache
+    inputs = input_ids if input_ids is not None else inputs_embeds
     use_cache = use_cache if use_cache is not None else self.config.use_cache
+    use_cache = True if inputs.device.type == "xpu" else use_cache

-
-
-
-        (input_ids is None) ^ (inputs_embeds is None),
-        "You cannot specify both input_ids and inputs_embeds at the same time, "
-        "and must specify either one"
+    use_quantize_kv = self.config.hidden_size != 3584 and use_quantize_kv_cache(
+        self.layers[0].mlp.down_proj, inputs,
+        self.config.num_attention_heads, self.config.num_key_value_heads
     )

-
-        if use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. "
-                "Setting `use_cache=False`..."
-            )
-            use_cache = False
-
-    if inputs_embeds is None:
-        inputs_embeds = self.embed_tokens(input_ids)
-
-    # ipex-llm changes start
-    # IPEX-LLM OPT: kv cache and quantize kv cache
-    num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
-    use_quantize_kv = (
-        self.config.hidden_size != 3584  # disable quantize kv in specific model
-        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
-                                  num_heads, num_kv_heads)
-    )
-    use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
+    use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
         isinstance(past_key_values, DynamicCompressCache)

     if use_cache:
@@ -328,79 +98,24 @@ def qwen2_model_forward_4_42(
             past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
     # ipex-llm changes end

-
-
-
-
-
-    if position_ids is None:
-        position_ids = cache_position.unsqueeze(0)
-
-    causal_mask = self._update_causal_mask(
-        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-    )
-
-    hidden_states = inputs_embeds
-
-    # decoder layers
-    all_hidden_states = () if output_hidden_states else None
-    all_self_attns = () if output_attentions else None
-    next_decoder_cache = None
-
-    for decoder_layer in self.layers:
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        if self.gradient_checkpointing and self.training:
-            layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
-                hidden_states,
-                causal_mask,
-                position_ids,
-                past_key_values,
-                output_attentions,
-                use_cache,
-                cache_position,
-            )
-        else:
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=causal_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                cache_position=cache_position,
-            )
-
-        hidden_states = layer_outputs[0]
-
-        if use_cache:
-            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-        if output_attentions:
-            all_self_attns += (layer_outputs[1],)
-
-    hidden_states = self.norm(hidden_states)
-
-    # add hidden states from the last decoder layer
-    if output_hidden_states:
-        all_hidden_states += (hidden_states,)
-
-    # ipex-llm changes start: remove `to_legacy_cache`
-    next_cache = None
-    if use_cache:
-        next_cache = next_decoder_cache
-    # ipex-llm changes end
+    # `cache_position` is required after transformers 4.42
+    if cache_position is not None:
+        kwargs = {"cache_position": cache_position}
+    else:
+        kwargs = {}

-
-
-
-
-
-        past_key_values=
-
-
+    return Qwen2Model.forward(
+        self=self,
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        **kwargs
     )

ipex_llm/transformers/models/utils.py
CHANGED
@@ -272,26 +272,6 @@ def use_xmx(x: torch.Tensor, qtype: int):
     )


-def fp16_fusion_check(proj, x, training):
-    # only use fp16 fusion on PVC inference
-    if proj is None:
-        return False
-    if not hasattr(proj, "qtype"):
-        return False
-    if proj.qtype != ggml_tensor_qtype["fp16"]:
-        return False
-    if proj.weight_type != 2:
-        return False
-    if training:
-        return False
-    if x.requires_grad:
-        return False
-    device_type = get_xpu_device_name(x.device)
-    if device_type != "pvc":
-        return False
-    return True
-
-
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
     if n_rep == 1:
ipex_llm/transformers/npu_model.py
CHANGED
@@ -182,13 +182,17 @@ class _BaseAutoModelClass:
         if hasattr(model, "config") and model.config.model_type == "glm":
             # convert to llama structure
             from .npu_models.glm_edge import convert_config, load_weights, convert_state_dict
-            import json
             original_path = model.config._name_or_path
             del model

-
-                original_config = json.load(f)
+            original_config, _ = PretrainedConfig.get_config_dict(original_path)
             config = convert_config(original_config)
+
+            if not os.path.isdir(original_path):
+                # all model files are already cached
+                from transformers.utils.hub import cached_file
+                resolved_file = cached_file(original_path, "config.json")
+                original_path = os.path.dirname(resolved_file)
             original_state_dict = load_weights(original_path)
             new_dict, _ = convert_state_dict(original_state_dict, config,
                                              original_config.get("partial_rotary_factor", 1.0),
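The npu_model.py hunk replaces a direct json.load of config.json with transformers' own resolution helpers, which also work when config._name_or_path is a hub model id whose files only exist in the local cache. A minimal standalone sketch of that resolution flow (the model id in the usage comment is a placeholder):

import os
from transformers import PretrainedConfig
from transformers.utils.hub import cached_file

def resolve_config_and_dir(name_or_path):
    # Works for a local checkpoint directory as well as a cached hub model id.
    config_dict, _ = PretrainedConfig.get_config_dict(name_or_path)
    model_dir = name_or_path
    if not os.path.isdir(model_dir):
        # Hub id: locate the cached config.json and use its snapshot directory.
        resolved_file = cached_file(name_or_path, "config.json")
        model_dir = os.path.dirname(resolved_file)
    return config_dict, model_dir

# Example with a placeholder model id:
# config_dict, model_dir = resolve_config_and_dir("THUDM/glm-edge-1.5b-chat")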
{ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.
+Version: 2.2.0b20250111
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250111 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250111 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.
+Requires-Dist: bigdl-core-npu ==2.6.0b20250111 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250111 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250111 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250111 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250111 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250111 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250111 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250111 ; extra == 'xpu-2-6'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
 Provides-Extra: xpu-arc
 Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250111 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250111 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250111 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250111 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250111 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250111 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250111 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250111 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250111 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
{ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/RECORD
CHANGED
@@ -94,17 +94,17 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
 ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
 ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
 ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
-ipex_llm/transformers/convert.py,sha256=
-ipex_llm/transformers/convert_ipex.py,sha256=
+ipex_llm/transformers/convert.py,sha256=i2IOmDnQBKNtcfUL95l8w4rNBMiU4SqX_9uz_LtPHMI,98086
+ipex_llm/transformers/convert_ipex.py,sha256=_nSnUTQy-yfkKaqGdqnBdWztZf3NGmnbZ0TKaDrF4X4,14617
 ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
 ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
 ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s,3289
 ipex_llm/transformers/loader.py,sha256=AwjV5RpI2t2bedlv7ZhLm8cfd-QJZm5hny-XyjIvdnk,6876
 ipex_llm/transformers/lookup.py,sha256=b6OlZ9OV10R9qeWw8mVryVpDxszkjwLkldvi7GPMJY8,19614
-ipex_llm/transformers/low_bit_linear.py,sha256=
+ipex_llm/transformers/low_bit_linear.py,sha256=QBHrAG7lgOgVO1LHPNlimn8Icm44kEpnWOLtVuHoHDA,40857
 ipex_llm/transformers/model.py,sha256=fj7LBjrWtWwDJJYXnWiXsLGS4ayqqHfnh0p51dSDssE,40908
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=
+ipex_llm/transformers/npu_model.py,sha256=X8ZtvZJpzz64XrSPhUYXXZmdJcbZ9X6G3Vlzw-zgN1Q,39749
 ipex_llm/transformers/patches.py,sha256=halPWm__ORh2fRFSIFPiCNg3LQBfrRkTPtmtRpBJCZQ,1286
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=jtPGsvWFjbTUGzDBCdfftnCis_0nJQNRpACSwXUbbGU,14943
@@ -144,7 +144,7 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
 ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
 ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
 ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
-ipex_llm/transformers/models/baichuan.py,sha256=
+ipex_llm/transformers/models/baichuan.py,sha256=8b43mBRZJEf_xLNoodhA4r9x1anqwC3Wt8awWel-aUo,18306
 ipex_llm/transformers/models/bert.py,sha256=0Mm9jkvkzBxtc_z_GE1TcZoPz-HOg2Z2973ZEWgSwJk,5601
 ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
 ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
@@ -159,7 +159,7 @@ ipex_llm/transformers/models/glm.py,sha256=lmeEWd_W2O638VzVW4Gm6cJre5XZcg_QBmPs8
 ipex_llm/transformers/models/gpt2.py,sha256=YSaNgK1uLCFDuIFqnKO0Mi-AsOZsYav-7pNf_NpKGdM,3445
 ipex_llm/transformers/models/gptbigcode.py,sha256=cP1_qGWoa43R2WacAMblShjku4QupcCZiLaPPAoOUs4,9101
 ipex_llm/transformers/models/gptneox.py,sha256=loRh1x_5S6BCeOr_s5xr-N_1SQHL3Y5IiUBAEyoMUqQ,6172
-ipex_llm/transformers/models/internlm.py,sha256=
+ipex_llm/transformers/models/internlm.py,sha256=JZFrI2HXsIAfM-6pA2RO0wcXopOliC1FggLMzNzaDZ4,17404
 ipex_llm/transformers/models/internvl.py,sha256=Vx0vENIEQLX2M6P398mw5TOhpks0U8xf8rtRQvy94go,8154
 ipex_llm/transformers/models/llama.py,sha256=NzpyQve_RC9ez1W-jWPLGZ80k_S1I5Rx5saAzCsDIoI,8558
 ipex_llm/transformers/models/minicpm.py,sha256=eaPNVNrep0_xGoELhZd886ff0ceoKqB6cusdAhd52eE,10145
@@ -169,10 +169,10 @@ ipex_llm/transformers/models/mistral.py,sha256=uVhkdXaq15v1P3QY0emVsA7SxUbAWChHE
 ipex_llm/transformers/models/mllama.py,sha256=ZyRq9DTKsvk1AlRbr-z6ngjS3Sr_7YuGZ6-Yr1MBBAM,10937
 ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifsD907TQ,9540
 ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
-ipex_llm/transformers/models/phi3.py,sha256=
+ipex_llm/transformers/models/phi3.py,sha256=AaWB7TPQdrDYgpcVHglG0Q0480bxNOw1mFePddlBEFk,14849
 ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
 ipex_llm/transformers/models/qwen.py,sha256=A3WiVCzA7NLkcjp4zhFkZvKZzZWZlg0WFuVV_556TAI,19543
-ipex_llm/transformers/models/qwen2.py,sha256=
+ipex_llm/transformers/models/qwen2.py,sha256=zK-FpUaxEhjD4gZa1ZvArodAilz29T_cpeAqfCGosc0,14317
 ipex_llm/transformers/models/qwen2_moe.py,sha256=a0gYo-ngf8SxaEnBdZUJDnPS6Mkn_poDd8xqhx50icI,19516
 ipex_llm/transformers/models/qwen2_vl.py,sha256=NrhxlaPj7W-HUBmKc3CSTwZy1lkoZ9qDaxM4GvE0kHs,13583
 ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
@@ -181,7 +181,7 @@ ipex_llm/transformers/models/rwkv5.py,sha256=OkRNj1pCAZg1z2Fw-I0DEnxLEdZyPeRSQ6m
 ipex_llm/transformers/models/sd.py,sha256=VvHV5u-0k2MgHu3NL9113hPj7DgfxqctuKzEEeNfRDU,5981
 ipex_llm/transformers/models/stablelm.py,sha256=fj-XtOnR6kggnFUQTMPCOOzolkPztN06WAv8QW-XRnI,7054
 ipex_llm/transformers/models/starcoder2.py,sha256=ONKvD7JCkRM0DI-R56x28QFBJ7CjD5hOZBQ_3WfOcNk,6626
-ipex_llm/transformers/models/utils.py,sha256=
+ipex_llm/transformers/models/utils.py,sha256=WYBc26vSiy_CzV07z-eT5ts90Kko2yUmS3DDZtfGcRk,15065
 ipex_llm/transformers/models/yuan.py,sha256=JYAn_ZaSGK0NBJLEIxCACfAq084a66GFJkdd5NbpmMA,7732
 ipex_llm/transformers/npu_models/__init__.py,sha256=ulEUGLjaP48LCrVeury3UxLjXxKzRi0UpSG4bYu-7f8,585
 ipex_llm/transformers/npu_models/baichuan.py,sha256=fJtd7fBrttySghRUgfZTAdxLjsSNC-XL08HISsXigLE,4685
@@ -250,11 +250,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
+ipex_llm-2.2.0b20250111.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ipex_llm-2.2.0b20250111.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ipex_llm-2.2.0b20250111.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ipex_llm-2.2.0b20250111.dist-info/METADATA,sha256=8HtPWBsOYbGoboTzVcl5ygPez_bfjvEzqxWanbYhx_o,12705
+ipex_llm-2.2.0b20250111.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ipex_llm-2.2.0b20250111.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250111.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250111.dist-info/RECORD,,
{ipex_llm-2.2.0b20250109.data → ipex_llm-2.2.0b20250111.data}/scripts/ipex-llm-init
File without changes
{ipex_llm-2.2.0b20250109.data → ipex_llm-2.2.0b20250111.data}/scripts/llm-chat
File without changes
{ipex_llm-2.2.0b20250109.data → ipex_llm-2.2.0b20250111.data}/scripts/llm-cli
File without changes
{ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/WHEEL
File without changes
{ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/entry_points.txt
File without changes
{ipex_llm-2.2.0b20250109.dist-info → ipex_llm-2.2.0b20250111.dist-info}/top_level.txt
File without changes