ipex-llm 2.2.0b20250108__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250110__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ipex_llm/transformers/convert.py +9 -22
  2. ipex_llm/transformers/convert_ipex.py +8 -1
  3. ipex_llm/transformers/low_bit_linear.py +5 -5
  4. ipex_llm/transformers/models/baichuan.py +8 -38
  5. ipex_llm/transformers/models/bert.py +2 -13
  6. ipex_llm/transformers/models/chatglm2.py +8 -31
  7. ipex_llm/transformers/models/chatglm4.py +9 -4
  8. ipex_llm/transformers/models/chatglm4v.py +1 -1
  9. ipex_llm/transformers/models/common.py +3 -1
  10. ipex_llm/transformers/models/glm.py +1 -1
  11. ipex_llm/transformers/models/internlm.py +6 -18
  12. ipex_llm/transformers/models/llama.py +1 -1
  13. ipex_llm/transformers/models/minicpm.py +1 -1
  14. ipex_llm/transformers/models/minicpm3.py +3 -1
  15. ipex_llm/transformers/models/mistral.py +1 -1
  16. ipex_llm/transformers/models/mllama.py +1 -1
  17. ipex_llm/transformers/models/phi3.py +8 -21
  18. ipex_llm/transformers/models/qwen.py +4 -2
  19. ipex_llm/transformers/models/qwen2.py +25 -309
  20. ipex_llm/transformers/models/qwen2_moe.py +4 -2
  21. ipex_llm/transformers/models/qwen2_vl.py +3 -1
  22. ipex_llm/transformers/models/stablelm.py +3 -1
  23. ipex_llm/transformers/models/starcoder2.py +3 -1
  24. ipex_llm/transformers/models/utils.py +7 -23
  25. ipex_llm/transformers/models/yuan.py +2 -1
  26. ipex_llm/transformers/npu_model.py +7 -3
  27. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250110.dist-info}/METADATA +20 -20
  28. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250110.dist-info}/RECORD +34 -34
  29. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250110.data}/scripts/ipex-llm-init +0 -0
  30. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250110.data}/scripts/llm-chat +0 -0
  31. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250110.data}/scripts/llm-cli +0 -0
  32. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250110.dist-info}/WHEEL +0 -0
  33. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250110.dist-info}/entry_points.txt +0 -0
  34. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250110.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/qwen2.py

@@ -51,217 +51,14 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, \
      should_use_compresskv, is_enough_kv_cache_room_4_36
  from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \
      DynamicCompressCache, DynamicCompressFp8Cache
- from ipex_llm.utils.common import invalidInputError

- from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2Model, Qwen2Attention, Qwen2MLP
  from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
  from transformers.cache_utils import Cache
- from transformers import logging
-
-
- logger = logging.get_logger(__name__)


  def qwen2_model_forward(
-     self,
-     input_ids: torch.LongTensor = None,
-     attention_mask: Optional[torch.Tensor] = None,
-     position_ids: Optional[torch.LongTensor] = None,
-     past_key_values: Optional[List[torch.FloatTensor]] = None,
-     inputs_embeds: Optional[torch.FloatTensor] = None,
-     use_cache: Optional[bool] = None,
-     output_attentions: Optional[bool] = None,
-     output_hidden_states: Optional[bool] = None,
-     return_dict: Optional[bool] = None,
-     cache_position: Optional[torch.LongTensor] = None, # for transformers >= 4.42
- ) -> Union[Tuple, BaseModelOutputWithPast]:
-     output_attentions = (
-         output_attentions if output_attentions is not None
-         else self.config.output_attentions
-     )
-     output_hidden_states = (
-         output_hidden_states if output_hidden_states is not None
-         else self.config.output_hidden_states
-     )
-     use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-     # retrieve input_ids and inputs_embeds
-     if input_ids is not None and inputs_embeds is not None:
-         invalidInputError(False,
-                           "You cannot specify both input_ids and inputs_embeds at the same time")
-     elif input_ids is not None:
-         batch_size, seq_length = input_ids.shape
-     elif inputs_embeds is not None:
-         batch_size, seq_length, _ = inputs_embeds.shape
-     else:
-         invalidInputError(False,
-                           "You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-     if self.gradient_checkpointing and self.training:
-         if use_cache:
-             logger.warning_once(
-                 "`use_cache=True` is incompatible with gradient checkpointing. "
-                 "Setting `use_cache=False`..."
-             )
-             use_cache = False
-
-     past_key_values_length = 0
-
-     # ipex-llm changes start
-     # IPEX-LLM OPT: kv cache and quantize kv cache
-     inputs = input_ids if input_ids is not None else inputs_embeds
-     use_quantize_kv = (
-         self.config.hidden_size != 3584 # disable quantize kv in specific model
-         and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
-                                   self.config.num_attention_heads//self.config.num_key_value_heads)
-     )
-     use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
-         isinstance(past_key_values, DynamicCompressCache)
-
-     if use_cache:
-         if use_compress_kv and not isinstance(past_key_values, DynamicCompressCache):
-             if use_quantize_kv:
-                 past_key_values = DynamicCompressFp8Cache.from_legacy_cache(past_key_values)
-             else:
-                 past_key_values = DynamicCompressCache.from_legacy_cache(past_key_values)
-         elif use_quantize_kv and not use_compress_kv and not isinstance(past_key_values,
-                                                                         DynamicFp8Cache):
-             past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
-         if not use_quantize_kv and not use_compress_kv and not isinstance(past_key_values,
-                                                                           DynamicNormalCache):
-             past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
-         past_key_values_length = past_key_values.get_usable_length(seq_length)
-     # ipex-llm changes end
-
-     if position_ids is None:
-         device = input_ids.device if input_ids is not None else inputs_embeds.device
-         position_ids = torch.arange(
-             past_key_values_length, seq_length + past_key_values_length,
-             dtype=torch.long, device=device
-         )
-         position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-     else:
-         position_ids = position_ids.view(-1, seq_length).long()
-
-     if inputs_embeds is None:
-         inputs_embeds = self.embed_tokens(input_ids)
-
-     flash_attn_2 = self._attn_implementation == "flash_attention_2"
-     if attention_mask is not None and flash_attn_2 and use_cache:
-
-         is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-         if is_padding_right:
-             invalidInputError(
-                 False,
-                 "You are attempting to perform batched generation with padding_side='right'"
-                 " this may lead to unexpected behaviour for Flash Attention version of Qwen2."
-                 " Make sure to call `tokenizer.padding_side = 'left'` before tokenizing "
-                 "the input. "
-             )
-
-     from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa
-     from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask
-
-     # ipex-llm changes start: don't generate `attention_mask` in decode phase
-     if seq_length == 1:
-         attention_mask = None
-     # ipex-llm changes end
-     elif self._attn_implementation == "flash_attention_2":
-         # 2d mask is passed through the layers
-         attention_mask = attention_mask if (attention_mask is not None and
-                                             0 in attention_mask) else None
-     elif self._attn_implementation == "sdpa" and not output_attentions:
-         # output_attentions=True can not be supported when using SDPA, and we fall back on
-         # the manual implementation that requires a 4D causal mask in all cases.
-         attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-             attention_mask,
-             (batch_size, seq_length),
-             inputs_embeds,
-             past_key_values_length,
-         )
-     else:
-         # 4d mask is passed through the layers
-         attention_mask = _prepare_4d_causal_attention_mask(
-             attention_mask,
-             (batch_size, seq_length),
-             inputs_embeds,
-             past_key_values_length,
-             sliding_window=self.config.sliding_window,
-         )
-
-     hidden_states = inputs_embeds
-
-     # decoder layers
-     all_hidden_states = () if output_hidden_states else None
-     all_self_attns = () if output_attentions else None
-     next_decoder_cache = None
-
-     for decoder_layer in self.layers:
-         if output_hidden_states:
-             all_hidden_states += (hidden_states,)
-
-         if self.gradient_checkpointing and self.training:
-             layer_outputs = self._gradient_checkpointing_func(
-                 decoder_layer.__call__,
-                 hidden_states,
-                 attention_mask,
-                 position_ids,
-                 past_key_values,
-                 output_attentions,
-                 use_cache,
-             )
-         else:
-             # ipex-llm changes
-             curr_device = decoder_layer.input_layernorm.weight.device
-             if attention_mask is not None:
-                 attention_mask = attention_mask.to(curr_device)
-             if position_ids is not None:
-                 position_ids = position_ids.to(curr_device)
-             # ipex-llm changes end
-             layer_outputs = decoder_layer(
-                 hidden_states,
-                 attention_mask=attention_mask,
-                 position_ids=position_ids,
-                 past_key_value=past_key_values,
-                 output_attentions=output_attentions,
-                 use_cache=use_cache,
-             )
-
-         hidden_states = layer_outputs[0]
-
-         if use_cache:
-             next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-         if output_attentions:
-             all_self_attns += (layer_outputs[1],)
-
-     hidden_states = self.norm(hidden_states)
-
-     # add hidden states from the last decoder layer
-     if output_hidden_states:
-         all_hidden_states += (hidden_states,)
-
-     # ipex-llm changes start: remove `to_legacy_cache`
-     next_cache = None
-     if use_cache:
-         next_cache = next_decoder_cache
-     # ipex-llm changes end
-
-     if not return_dict:
-         return tuple(v for v in [hidden_states, next_cache,
-                                  all_hidden_states, all_self_attns] if v is not None)
-     return BaseModelOutputWithPast(
-         last_hidden_state=hidden_states,
-         past_key_values=next_cache,
-         hidden_states=all_hidden_states,
-         attentions=all_self_attns,
-     )
-
-
- def qwen2_model_forward_4_42(
      self,
      input_ids: torch.LongTensor = None,
      attention_mask: Optional[torch.Tensor] = None,
@@ -274,43 +71,17 @@ def qwen2_model_forward_4_42
      return_dict: Optional[bool] = None,
      cache_position: Optional[torch.LongTensor] = None,
  ) -> Union[Tuple, BaseModelOutputWithPast]:
-     output_attentions = (
-         output_attentions if output_attentions is not None
-         else self.config.output_attentions
-     )
-     output_hidden_states = (
-         output_hidden_states if output_hidden_states is not None
-         else self.config.output_hidden_states
-     )
+     # IPEX-LLM OPT start: kv cache and quantize kv cache
+     inputs = input_ids if input_ids is not None else inputs_embeds
      use_cache = use_cache if use_cache is not None else self.config.use_cache
+     use_cache = True if inputs.device.type == "xpu" else use_cache

-     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-     invalidInputError(
-         (input_ids is None) ^ (inputs_embeds is None),
-         "You cannot specify both input_ids and inputs_embeds at the same time, "
-         "and must specify either one"
+     use_quantize_kv = self.config.hidden_size != 3584 and use_quantize_kv_cache(
+         self.layers[0].mlp.down_proj, inputs,
+         self.config.num_attention_heads, self.config.num_key_value_heads
      )

-     if self.gradient_checkpointing and self.training:
-         if use_cache:
-             logger.warning_once(
-                 "`use_cache=True` is incompatible with gradient checkpointing. "
-                 "Setting `use_cache=False`..."
-             )
-             use_cache = False
-
-     if inputs_embeds is None:
-         inputs_embeds = self.embed_tokens(input_ids)
-
-     # ipex-llm changes start
-     # IPEX-LLM OPT: kv cache and quantize kv cache
-     use_quantize_kv = (
-         self.config.hidden_size != 3584 # disable quantize kv in specific model
-         and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
-                                   self.config.num_attention_heads//self.config.num_key_value_heads)
-     )
-     use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
+     use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
          isinstance(past_key_values, DynamicCompressCache)

      if use_cache:
@@ -327,79 +98,24 @@ def qwen2_model_forward_4_42
              past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
      # ipex-llm changes end

-     if cache_position is None:
-         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-         cache_position = torch.arange(
-             past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-         )
-     if position_ids is None:
-         position_ids = cache_position.unsqueeze(0)
-
-     causal_mask = self._update_causal_mask(
-         attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
-     )
-
-     hidden_states = inputs_embeds
-
-     # decoder layers
-     all_hidden_states = () if output_hidden_states else None
-     all_self_attns = () if output_attentions else None
-     next_decoder_cache = None
-
-     for decoder_layer in self.layers:
-         if output_hidden_states:
-             all_hidden_states += (hidden_states,)
-
-         if self.gradient_checkpointing and self.training:
-             layer_outputs = self._gradient_checkpointing_func(
-                 decoder_layer.__call__,
-                 hidden_states,
-                 causal_mask,
-                 position_ids,
-                 past_key_values,
-                 output_attentions,
-                 use_cache,
-                 cache_position,
-             )
-         else:
-             layer_outputs = decoder_layer(
-                 hidden_states,
-                 attention_mask=causal_mask,
-                 position_ids=position_ids,
-                 past_key_value=past_key_values,
-                 output_attentions=output_attentions,
-                 use_cache=use_cache,
-                 cache_position=cache_position,
-             )
-
-         hidden_states = layer_outputs[0]
-
-         if use_cache:
-             next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-         if output_attentions:
-             all_self_attns += (layer_outputs[1],)
-
-     hidden_states = self.norm(hidden_states)
-
-     # add hidden states from the last decoder layer
-     if output_hidden_states:
-         all_hidden_states += (hidden_states,)
-
-     # ipex-llm changes start: remove `to_legacy_cache`
-     next_cache = None
-     if use_cache:
-         next_cache = next_decoder_cache
-     # ipex-llm changes end
+     # `cache_position` is required after transformers 4.42
+     if cache_position is not None:
+         kwargs = {"cache_position": cache_position}
+     else:
+         kwargs = {}

-     if not return_dict:
-         return tuple(v for v in [hidden_states, next_cache,
-                                  all_hidden_states, all_self_attns] if v is not None)
-     return BaseModelOutputWithPast(
-         last_hidden_state=hidden_states,
-         past_key_values=next_cache,
-         hidden_states=all_hidden_states,
-         attentions=all_self_attns,
+     return Qwen2Model.forward(
+         self=self,
+         input_ids=input_ids,
+         attention_mask=attention_mask,
+         position_ids=position_ids,
+         past_key_values=past_key_values,
+         inputs_embeds=inputs_embeds,
+         use_cache=use_cache,
+         output_attentions=output_attentions,
+         output_hidden_states=output_hidden_states,
+         return_dict=return_dict,
+         **kwargs
      )

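Note on the qwen2.py hunks above: the release drops ipex-llm's hand-maintained copy of the Qwen2 decoder loop and keeps only a thin wrapper that prepares the KV cache (FP8, compressed, or plain) and then delegates to the stock `Qwen2Model.forward`. A minimal sketch of that delegation pattern is shown below; it is illustrative only, the wrapper name and trimmed argument list are hypothetical, and only the plain-cache branch is included.

    # Illustrative sketch only -- not the ipex-llm source.
    from transformers.models.qwen2.modeling_qwen2 import Qwen2Model
    from ipex_llm.transformers.kv import DynamicNormalCache

    def patched_qwen2_forward(self, input_ids=None, inputs_embeds=None,
                              past_key_values=None, use_cache=None,
                              cache_position=None, **hf_kwargs):
        inputs = input_ids if input_ids is not None else inputs_embeds
        # the new code forces use_cache on XPU devices
        use_cache = True if inputs.device.type == "xpu" else use_cache
        if use_cache and not isinstance(past_key_values, DynamicNormalCache):
            past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
        # `cache_position` only exists in newer transformers releases, so pass it conditionally
        extra = {"cache_position": cache_position} if cache_position is not None else {}
        return Qwen2Model.forward(self, input_ids=input_ids, inputs_embeds=inputs_embeds,
                                  past_key_values=past_key_values, use_cache=use_cache,
                                  **hf_kwargs, **extra)
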
ipex_llm/transformers/models/qwen2_moe.py

@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
      return_dict: Optional[bool] = None,
  ):
      use_cache = use_cache if use_cache is not None else self.config.use_cache
-     input = input_ids if input_ids is not None else inputs_embeds
-     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input)
+     inputs = input_ids if input_ids is not None else inputs_embeds
+     num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+                                             num_heads, num_kv_heads)
      if use_cache:
          if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
              past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/qwen2_vl.py

@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
      # IPEX-LLM OPT start: kv cache and quantize kv cache
      inputs = input_ids if input_ids is not None else inputs_embeds
      use_cache = True if inputs.device.type == "xpu" else use_cache
-     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+     num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+                                             num_heads, num_kv_heads)
      if use_cache:
          if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
              past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/stablelm.py

@@ -69,8 +69,10 @@ def stablelm_model_forward(
  ):
      # IPEX-LLM OPT: kv cache and quantize kv cache
      use_cache = use_cache if use_cache is not None else self.config.use_cache
+     num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
      use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
-                        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+                        and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+                                                  num_heads, num_kv_heads))
      if use_cache:
          if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
              past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/starcoder2.py

@@ -132,7 +132,9 @@ def model_forward(
      return_dict: Optional[bool] = None,
  ):
      use_cache = use_cache if use_cache is not None else self.config.use_cache
-     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids)
+     num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+     use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+                                             num_heads, num_kv_heads)
      if use_cache:
          if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
              past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)
ipex_llm/transformers/models/utils.py

@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
      return new_cache_k, new_cache_v


- def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: int = 1) -> bool:
+ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+                           num_heads: int, num_kv_heads: int) -> bool:
      if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
          warnings.warn(
              "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
      else:
          device_name = get_xpu_device_name(x.device)
          return (
-             device_name in ["mtl", "lnl", "arl"] and kv_group == 1
-             or device_name in ["arc", "bmg"] and x.size(0) > 1
+             num_kv_heads >= 4
+             and (
+                 device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+                 or device_name in ["arc", "bmg"] and x.size(0) > 1
+             )
          )


@@ -268,26 +272,6 @@ def use_xmx(x: torch.Tensor, qtype: int):
      )


- def fp16_fusion_check(proj, x, training):
-     # only use fp16 fusion on PVC inference
-     if proj is None:
-         return False
-     if not hasattr(proj, "qtype"):
-         return False
-     if proj.qtype != ggml_tensor_qtype["fp16"]:
-         return False
-     if proj.weight_type != 2:
-         return False
-     if training:
-         return False
-     if x.requires_grad:
-         return False
-     device_type = get_xpu_device_name(x.device)
-     if device_type != "pvc":
-         return False
-     return True
-
-
  def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
      batch, num_key_value_heads, slen, head_dim = hidden_states.shape
      if n_rep == 1:
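The new `use_quantize_kv_cache` signature is what drives the per-model edits earlier in this diff: every call site now passes `num_attention_heads` and `num_key_value_heads` from the model config, and FP8 KV cache is only chosen when the model has at least 4 KV heads and, on mtl/lnl/arl devices, a GQA ratio (`num_heads // num_kv_heads`) of at most 4. A rough call-site sketch, assuming `model` is a decoder backbone such as `Qwen2Model` and `inputs` is the current `input_ids` or `inputs_embeds` (both names are placeholders):

    # Hypothetical helper mirroring the call pattern used by the model files above.
    from ipex_llm.transformers.models.utils import use_quantize_kv_cache

    def wants_fp8_kv_cache(model, inputs):
        cfg = model.config
        return use_quantize_kv_cache(
            model.layers[0].mlp.down_proj,   # a reference linear layer from the model
            inputs,                          # input_ids or inputs_embeds for this step
            cfg.num_attention_heads,         # query heads
            cfg.num_key_value_heads,         # KV heads; the heuristic requires >= 4
        )
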
ipex_llm/transformers/models/yuan.py

@@ -158,7 +158,8 @@ def yuan_attention_forward(
                        "yuan")

      # IPEX-LLM OPT: kv cache and quantzie kv cache
-     use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+     use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+                                             self.num_heads, self.num_heads)
      key_states, value_states = update_past_key_value(
          None if past_key_value is None else (past_key_value[0], past_key_value[1]),
          key_states, value_states,
ipex_llm/transformers/npu_model.py

@@ -182,13 +182,17 @@ class _BaseAutoModelClass:
          if hasattr(model, "config") and model.config.model_type == "glm":
              # convert to llama structure
              from .npu_models.glm_edge import convert_config, load_weights, convert_state_dict
-             import json
              original_path = model.config._name_or_path
              del model

-             with open(os.path.join(original_path, "config.json")) as f:
-                 original_config = json.load(f)
+             original_config, _ = PretrainedConfig.get_config_dict(original_path)
              config = convert_config(original_config)
+
+             if not os.path.isdir(original_path):
+                 # all model files are already cached
+                 from transformers.utils.hub import cached_file
+                 resolved_file = cached_file(original_path, "config.json")
+                 original_path = os.path.dirname(resolved_file)
              original_state_dict = load_weights(original_path)
              new_dict, _ = convert_state_dict(original_state_dict, config,
                                               original_config.get("partial_rotary_factor", 1.0),
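The npu_model.py hunk above lets the GLM-edge conversion path accept a Hugging Face Hub model id as well as a local directory: `PretrainedConfig.get_config_dict` loads the config either way, and `cached_file` is used to locate the already-downloaded snapshot directory when the path is not a local folder. A minimal standalone sketch of that resolution logic, with `model_id_or_path` as a placeholder value:

    import os
    from transformers import PretrainedConfig
    from transformers.utils.hub import cached_file

    model_id_or_path = "some-org/some-glm-edge-model"  # placeholder: local dir or hub id

    # resolves the config whether the argument is a directory or a cached hub repo
    config_dict, _ = PretrainedConfig.get_config_dict(model_id_or_path)

    local_dir = model_id_or_path
    if not os.path.isdir(local_dir):
        # recover the cached snapshot directory from a file known to exist in the repo
        local_dir = os.path.dirname(cached_file(model_id_or_path, "config.json"))
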
{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250110.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250108
+ Version: 2.2.0b20250110
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250108 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250110 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
  Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250108 ; extra == 'cpp-arl'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250110 ; extra == 'cpp-arl'
  Requires-Dist: setuptools ; extra == 'cpp-arl'
  Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
  Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250108 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250110 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250108 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250108 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250108 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250110 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250110 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250110 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250110 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250110 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250110 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250108 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250110 ; extra == 'xpu-2-6'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
  Provides-Extra: xpu-arc
  Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250110 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250110 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250110 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250110 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250110 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250110 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250110 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250110 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250110 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'