ipex-llm 2.2.0b20250101__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250102__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ipex_llm/optimize.py CHANGED
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                                  torch_dtype=torch_dtype,
                                  optimize_model=optimize_llm,
                                  modules_to_not_convert=modules_to_not_convert,
-                                 cpu_embedding=cpu_embedding)
+                                 cpu_embedding=cpu_embedding,
+                                 disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                                                 False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
-                         mixed_precision=False):
+                         mixed_precision=False,
+                         disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model
 
-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)
 
     act_order = False
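
Taken together, the two hunks above thread a new disable_optimize_pre keyword from optimize_model down to ggml_convert_low_bit, so the _optimize_pre graph rewrites can be skipped while the low-bit weight conversion itself still runs. A minimal usage sketch, assuming a torch model already loaded elsewhere (the flag is simply popped from **kwargs as shown above):

# Sketch only: "model" stands for any model object that optimize_model supports.
from ipex_llm import optimize_model

model = optimize_model(model,
                       low_bit="sym_int4",
                       disable_optimize_pre=True)  # skip the _optimize_pre rewrites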
@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False
 
     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@ class FP16Linear(nn.Linear):
             self.weight.data = self.weight.data.to(x.dtype)
 
         if not self.use_esimd_kernel(x):
-            if get_ipex_version() < "2.1.10+xpu" \
-                    or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]:
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@ class FP16Linear(nn.Linear):
 
     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
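
FP16Linear gains a disable_fp16_opt attribute (default False); when it is True the layer skips both the ESIMD kernel and the IPEX 2.1.10+xpu fast path and falls back to the plain fp16 matmul. The diff does not show where the flag is toggled, so the traversal below is only a hypothetical way to flip it on every FP16Linear module:

import torch
from ipex_llm.transformers.low_bit_linear import FP16Linear

def force_plain_fp16(model: torch.nn.Module) -> torch.nn.Module:
    # Hypothetical helper: the attribute is real, the traversal is an assumed usage pattern.
    for module in model.modules():
        if isinstance(module, FP16Linear):
            module.disable_fp16_opt = True
    return model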
@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
                                      torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                      imatrix_data=imatrix_data,
                                      embedding_qtype=embedding_qtype,
-                                     mixed_precision=mixed_precision)
+                                     mixed_precision=mixed_precision,
+                                     disable_optimize_pre=disable_optimize_pre)
 
         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
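
The same switch is exposed through the transformers-style loaders: from_pretrained pops disable_optimize_pre from **kwargs and forwards it into ggml_convert_low_bit. A usage sketch, with the checkpoint id as a placeholder:

# Sketch only: any Hugging Face checkpoint supported by ipex-llm works here.
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct",
                                             load_in_4bit=True,
                                             disable_optimize_pre=True)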
@@ -29,7 +29,7 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp
     should_use_compresskv
 from ipex_llm.transformers.models.utils import update_past_key_value
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
-from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp
+from ipex_llm.transformers.models.utils import use_sdp
 from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU
 from ipex_llm.transformers.models.utils import mlp_fusion_check
 from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36
@@ -301,16 +301,10 @@ def baichuan_attention_forward_7b(
 
     # IPEX-LLM OPT: sdp
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
-                                                     key_states.to(dtype=torch.float16),
-                                                     value_states.to(dtype=torch.float16),
-                                                     is_causal=True).to(hidden_states.dtype)
-    else:
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
@@ -23,7 +23,7 @@ import torch.utils.checkpoint
 import torch.nn.functional as F
 from typing import Optional, Tuple
 from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
-from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp
+from ipex_llm.transformers.models.utils import use_sdp
 
 
 def rotate_half(x):
@@ -41,7 +41,7 @@ def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
 
 
 def glm_sdpa(query, key, value, attention_mask=None, is_causal=False):
-    if use_flash_attention(query, key, attention_mask) or query.device.type == 'cpu':
+    if query.device.type == 'cpu':
         context_layer = F.scaled_dot_product_attention(query.to(key.dtype),
                                                        key,
                                                        value,
@@ -33,7 +33,6 @@ from ipex_llm.transformers.models.utils import update_past_key_value, should_use
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache
 from ipex_llm.transformers.models.utils import rotate_half, SILU
 from ipex_llm.transformers.models.utils import mlp_fusion_check
-from ipex_llm.transformers.models.utils import use_flash_attention
 from ipex_llm.utils.common import invalidInputError
 from transformers.modeling_outputs import BaseModelOutputWithPast
 
@@ -116,33 +115,28 @@ def qwen_attention_forward(
     past_key_value = (key_states.transpose(1, 2),
                       value_states.transpose(1, 2)) if use_cache else None
 
-    # IPEX-LLM OPT: sdp
+    # IPEX-LLM OPT: sdpa
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
-                                                     key_states.to(dtype=torch.float16),
-                                                     value_states.to(dtype=torch.float16),
-                                                     is_causal=True).to(hidden_states.dtype)
+
+    if q_len > 1 and q_len != kv_seq_len:
+        causal_mask = torch.tril(
+            torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
+        ).view(1, 1, kv_seq_len, kv_seq_len)
+        causal_mask = causal_mask[
+            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+        ]
+        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+                                     device=query_states.device)
+        attention_mask.masked_fill_(causal_mask.logical_not(),
+                                    torch.finfo(attention_mask.dtype).min)
+        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
     else:
-        if q_len > 1 and q_len != kv_seq_len:
-            causal_mask = torch.tril(
-                torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
-            ).view(1, 1, kv_seq_len, kv_seq_len)
-            causal_mask = causal_mask[
-                :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
-            ]
-            attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
-                                         device=query_states.device)
-            attention_mask.masked_fill_(causal_mask.logical_not(),
-                                        torch.finfo(attention_mask.dtype).min)
-            attention_mask = attention_mask.expand([bsz, -1, -1, -1])
-        else:
-            attention_mask = None
+        attention_mask = None
 
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.view(bsz, q_len, self.hidden_size)
@@ -219,31 +213,25 @@ def qwen_attention_forward_registered(
     past_key_value = (key_states.transpose(1, 2),
                       value_states.transpose(1, 2)) if use_cache else None
 
-    # IPEX-LLM OPT: sdp
+    # IPEX-LLM OPT: sdpa
    attn_weights = None
 
-    if use_flash_attention(query_states, key_states, attention_mask):
-        attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16),
-                                                     key_states.to(dtype=torch.float16),
-                                                     value_states.to(dtype=torch.float16),
-                                                     is_causal=True).to(hidden_states.dtype)
+    if q_len > 1 and q_len != kv_seq_len:
+        causal_mask = registered_causal_mask[
+            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+        ]
+        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+                                     device=query_states.device)
+        attention_mask.masked_fill_(causal_mask.logical_not(),
+                                    torch.finfo(attention_mask.dtype).min)
+        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
     else:
-        if q_len > 1 and q_len != kv_seq_len:
-            causal_mask = registered_causal_mask[
-                :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
-            ]
-            attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
-                                         device=query_states.device)
-            attention_mask.masked_fill_(causal_mask.logical_not(),
-                                        torch.finfo(attention_mask.dtype).min)
-            attention_mask = attention_mask.expand([bsz, -1, -1, -1])
-        else:
-            attention_mask = None
+        attention_mask = None
 
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.view(bsz, q_len, self.hidden_size)
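
Both Qwen forwards now build the additive causal mask by hand whenever the query covers only the tail of the KV sequence. A small self-contained repro of that mask construction, with arbitrary shapes:

import torch

bsz, q_len, kv_seq_len = 2, 3, 8          # illustrative sizes only
dtype = torch.float16

# Lower-triangular boolean mask over the full KV length, keeping only the
# rows that correspond to the current query positions.
causal_mask = torch.tril(torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool))
causal_mask = causal_mask.view(1, 1, kv_seq_len, kv_seq_len)
causal_mask = causal_mask[:, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len]

# Convert to an additive mask: 0 where attention is allowed, the dtype minimum
# where it is masked out.
attention_mask = torch.zeros(causal_mask.shape, dtype=dtype)
attention_mask.masked_fill_(causal_mask.logical_not(), torch.finfo(dtype).min)
attention_mask = attention_mask.expand([bsz, -1, -1, -1])

print(attention_mask.shape)               # torch.Size([2, 1, 3, 8])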
@@ -38,12 +38,10 @@
 #
 
 import os
-import math
 from typing import Optional, Tuple, Union, List
 
 import torch
 from torch.nn import CrossEntropyLoss
-from torch.nn.functional import scaled_dot_product_attention as sdpa
 
 from ipex_llm.transformers.models.common import merge_qkv_base
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
@@ -51,13 +49,12 @@ from ipex_llm.transformers.models.utils import SILU, mlp_fusion_check
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache, \
     should_use_compresskv, is_enough_kv_cache_room_4_36
-from ipex_llm.transformers.models.utils import use_flash_attention
 from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \
     DynamicCompressCache, DynamicCompressFp8Cache
 from ipex_llm.utils.common import invalidInputError
 
 from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP
-from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb, repeat_kv
+from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.cache_utils import Cache
 from transformers import logging
@@ -580,21 +577,10 @@ def qwen2_attention_forward(
                                 self.layer_idx, None)
 
     attn_weights = None
-    if use_flash_attention(query_states, key_states, attention_mask):
-        if attention_mask is not None:
-            attention_mask = attention_mask[:, :, :, :kv_seq_len]
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        attn_output = sdpa(query_states.to(device, dtype=torch.float16),
-                           key_states.to(device, dtype=torch.float16),
-                           value_states.to(device, dtype=torch.float16),
-                           is_causal=True).to(hidden_states.dtype)
-    else:
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
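
All of the attention forwards touched here now funnel into the scaled_dot_product_attention helper imported from ipex_llm.transformers.models.common. Its body is not part of this diff, but the call sites fix its shape: query/key/value in (batch, heads, seq, head_dim) layout, an optional additive mask, and a flag that is True when q_len == kv_seq_len. A purely illustrative stand-in with that calling convention (not the ipex_llm implementation) could look like:

import torch
import torch.nn.functional as F

def sdpa_like(query, key, value, attention_mask=None, is_causal=False):
    # Stand-in only: broadcast KV heads for grouped-query attention,
    # then defer to PyTorch's native SDPA kernel.
    n_rep = query.size(1) // key.size(1)
    if n_rep > 1:
        key = key.repeat_interleave(n_rep, dim=1)
        value = value.repeat_interleave(n_rep, dim=1)
    return F.scaled_dot_product_attention(
        query, key, value,
        attn_mask=attention_mask,
        is_causal=is_causal and attention_mask is None)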
@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
             model.share_memory()
 
             if not pipeline:
-                if (not hasattr(model, 'llm') and
-                        model.config.model_type in ["qwen2", "llama", "minicpm"]):
+                if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                     from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                     optimize_llm_single_process(
                         llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
                         group_size=quantization_group_size,
                         qtype=qtype,
                         save_directory=save_directory,
-                        fuse_layers=fuse_layers
+                        fuse_layers=fuse_layers,
+                        has_llm=hasattr(model, "llm")
                     )
                 else:
                     optimize_llm(
@@ -449,7 +449,8 @@ def optimize_llm_single_process(
     group_size: int,
     qtype: str,
     save_directory: str,
-    fuse_layers: int=None
+    fuse_layers: int=None,
+    has_llm: bool=False
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
         model.kv_len = kv_len
         model.model_ptr = model_ptr
         model.save_directory = save_directory
-        model.vocab_size = model.config.vocab_size
+        if model.config.vocab_size == 151666:
+            # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+            model.vocab_size = 152064
+        else:
+            model.vocab_size = model.config.vocab_size
         model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+        model.max_prompt_len = max_prompt_len
     except:
         invalidInputError(False,
                           "False to InitLLMPipeline.")
@@ -478,9 +484,10 @@ def optimize_llm_single_process(
     general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
     general_convert(model, PreTrainedModel, causal_lm_forward)
     # patch generate function
-    import types
-    model.original_generate = model.generate
-    model.generate = types.MethodType(generate, model)
+    if not has_llm:
+        import types
+        model.original_generate = model.generate
+        model.generate = types.MethodType(generate, model)
     return model
 
 
@@ -491,9 +498,10 @@ def prepare_input_ids(
     else: # prefill, reset the model here
         from .npu_llm_cpp import reset
         reset(self.model_ptr)
-    model_inputs = {
-        "input_ids": input_ids
-    }
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
     return model_inputs
 
 
@@ -511,17 +519,31 @@ def causal_lm_forward(
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
-    if isinstance(input_ids[0], torch.Tensor):
-        input_list = input_ids[0].flatten().tolist()
-    else:
-        input_list = input_ids[0]
-    input_length = len(input_list)
-    if input_length > 1:
-        logits = run_prefill_with_logits(self.model_ptr, input_list,
-                                         self.logits_buffer, self.vocab_size)
+    if input_ids is not None:
+        if isinstance(input_ids[0], torch.Tensor):
+            input_list = input_ids[0].flatten().tolist()
+        else:
+            input_list = input_ids[0]
+        input_length = len(input_list)
+        if input_length > 1:
+            logits = run_prefill_with_logits(self.model_ptr, input_list,
+                                             self.logits_buffer, self.vocab_size)
+        else:
+            logits = run_decode_with_logits(self.model_ptr, input_list[0],
+                                            self.logits_buffer, self.vocab_size)
+    elif inputs_embeds is not None:
+        seq_len = inputs_embeds.shape[1]
+        pad_len = self.max_prompt_len - seq_len
+        inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+                                                (0, 0, 0, pad_len), value=0.0)
+        logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+                                         self.vocab_size, inputs_embeds, seq_len)
     else:
-        logits = run_decode_with_logits(self.model_ptr, input_list[0],
-                                        self.logits_buffer, self.vocab_size)
+        invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
+
+    if self.config.vocab_size == 151666:
+        # for MiniCPM-V 2.6
+        logits = logits[:, :, :151666]
 
     return CausalLMOutputWithPast(
         loss=None,
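
The new inputs_embeds branch pads the embedding sequence up to max_prompt_len before calling the native prefill; for a (batch, seq, hidden) tensor, torch.nn.functional.pad's (0, 0, 0, pad_len) spec pads only the sequence dimension. The logits buffer is sized for Qwen2-7B's 152064-entry vocabulary and the result is sliced back to MiniCPM-V 2.6's 151666 entries afterwards. A tiny self-contained check of the padding arithmetic, with made-up sizes:

import torch
import torch.nn.functional as F

max_prompt_len, hidden = 512, 3584        # illustrative sizes only
inputs_embeds = torch.randn(1, 77, hidden, dtype=torch.float16)

pad_len = max_prompt_len - inputs_embeds.shape[1]
padded = F.pad(inputs_embeds, (0, 0, 0, pad_len), value=0.0)  # pads the seq dim only
print(padded.shape)                        # torch.Size([1, 512, 3584])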
@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
 _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
 _lib.load_model_from_file.restype = ctypes.c_void_p
 
-_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
-                             ctypes.c_float]
+_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+                             ctypes.c_float, ctypes.c_bool]
 _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)
 
 _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
 
-_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int),
-                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
+_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+                                         ctypes.c_int, ctypes.c_bool]
+
 _lib.run_prefill_with_logits.restype = None
 
 _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
 def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
     input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
     input_len = len(input_ids)
-    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
 
@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
     return new_token
 
 
-def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size):
-    input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
-    input_len = len(input_ids)
+def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+                            inputs_embeds=None, seq_len=None):
+    if input_ids is not None:
+        input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+        input_len = len(input_ids)
+    else:
+        input_ptr = inputs_embeds.contiguous().data.data_ptr()
+        input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+        input_len = seq_len
     logits_ptr = logits.data.data_ptr()
     logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
-    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr, vocab_size)
+    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+                                 vocab_size, (input_ids is None))
    return logits
 
 
@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     lm_head_n_splits = 1
     asym = getattr(model.config, "asym", False)
 
+    if vocab_size == 151666:
+        # for MiniCPM-V 2.6 lm_head on NPU
+        vocab_size = 152064
+
     if not isinstance(lm_head, SlicedLMHead):
         asym = lm_head.qtype == "asym_int4_rtn"
         if asym:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250101
+Version: 2.2.0b20250102
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250101 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250102 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250101 ; extra == 'cpp-arl'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250102 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -65,7 +65,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250101 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250102 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -85,9 +85,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250101 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250101 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250101 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250102 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250102 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250102 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -102,9 +102,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250101 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250101 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250101 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -119,9 +119,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250101 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250101 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250101 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -141,9 +141,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250101 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250101 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250101 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -163,9 +163,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250101 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250101 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250101 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
@@ -2,7 +2,7 @@ ipex_llm/__init__.py,sha256=kSA9JjVLPlpN4YchWtfOybRh4XiP6d_VTYvzbAouPSU,2118
 ipex_llm/convert_model.py,sha256=jopEe6wu88ZPZfNFhgnQUu7807iciiWW_EMyTsVni5A,6816
 ipex_llm/llm_patching.py,sha256=becMYcawtR8lgl2yeRQhvvX6CLaq09WZGm9dDmLJWL0,3232
 ipex_llm/models.py,sha256=XROP6GLLrGQDlogGXpXZENbV143YNi6j0VPJeOdQ3Cg,1063
-ipex_llm/optimize.py,sha256=4VYz8vgxSnrqBJhz__eB7hCJSwrkNx_t_wvTLxPlPyI,12253
+ipex_llm/optimize.py,sha256=ml-qEpzsrWGcd-Wia6IxPBC1PhqT5pi_lp3VTOH_ns0,12415
 ipex_llm/cli/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
 ipex_llm/cli/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
 ipex_llm/cli/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
@@ -94,17 +94,17 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
 ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
 ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
 ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
-ipex_llm/transformers/convert.py,sha256=B4oI836JHEqg_qT3dcl2RaJdQs7rOyigMwj-racxhkc,106379
+ipex_llm/transformers/convert.py,sha256=V4KDyi-2FVWSYZAxe4PlAxGGZbauSbOCuqq56ME9yyQ,106461
 ipex_llm/transformers/convert_ipex.py,sha256=iKXo0n8fVFTOA2fNYYrByMFK0dovL-kLd2sVDk88AlQ,14334
 ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
 ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
 ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s,3289
 ipex_llm/transformers/loader.py,sha256=cOgX93xOC-4dt01GTJ5wyd7PjZ8S43r4mctkR2YxVuw,6893
 ipex_llm/transformers/lookup.py,sha256=c4ETIha6ZLbWvhcclSKRDdi5Ipuet4mfUnOkBa0E8kk,19607
-ipex_llm/transformers/low_bit_linear.py,sha256=TJfEqNp6zB6YnNEUASga302WQXzNdrmU_miGCM0u-F8,41504
-ipex_llm/transformers/model.py,sha256=N-g9IQVvBiBhbL5Fo5DTWbmHPZY52sjfFuq0B8Qu6h4,40952
+ipex_llm/transformers/low_bit_linear.py,sha256=dyyYyCqw0GK8hzaUGanrg-uIhU1HTLEEbvbxXMlm-80,41668
+ipex_llm/transformers/model.py,sha256=KcRjkauGg48BYrUBoUZaVMpg7Piuz5JrfIpVZd3EIjs,41105
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=wPFEB4W1rYbpO_XqepREMef69dzo-zkFoqFRb_mqneA,37862
+ipex_llm/transformers/npu_model.py,sha256=a1mkyc6EqD7AJhqbYzokGhFubNpt5trIMuZT_dQKlTk,37861
 ipex_llm/transformers/patches.py,sha256=halPWm__ORh2fRFSIFPiCNg3LQBfrRkTPtmtRpBJCZQ,1286
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=jtPGsvWFjbTUGzDBCdfftnCis_0nJQNRpACSwXUbbGU,14943
@@ -143,10 +143,10 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
 ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
 ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
 ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
-ipex_llm/transformers/models/baichuan.py,sha256=0dkTSPqGPgSnwa8zSNKroam0pvSyQLSRpbb43-OgIlc,19815
+ipex_llm/transformers/models/baichuan.py,sha256=oJCAEENSG8oQhJ-QPN2SiapARjAGdOM6nEbyCcYOMCo,19334
 ipex_llm/transformers/models/bert.py,sha256=bJNic2pt1kph0kBwdK5MRGyWupFfx2Ts0V3D1L-5kWo,6085
 ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
-ipex_llm/transformers/models/chatglm.py,sha256=xCEhYzaXyTDBXqz111Uw4IW5x4TLbtBbYfmBT623gRI,12669
+ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
 ipex_llm/transformers/models/chatglm2.py,sha256=kfJThuKYb3unAB1XCzfop1iDW1gOkyFOjSr-lEjUdS0,24781
 ipex_llm/transformers/models/chatglm4.py,sha256=AAhAFFDDas5DBQPfh2Mwl7a2v7taKf6xphoeeNNFaBI,16593
 ipex_llm/transformers/models/chatglm4v.py,sha256=YRfuf9g1E0MQ_7wbHAOMvadFnO-j3LqI_k1SaRkDs0M,14055
@@ -174,8 +174,8 @@ ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifs
 ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
 ipex_llm/transformers/models/phi3.py,sha256=jkiadJ85ToHpymY5GOM6orWlnx6LKN8_-v1MUcfGWPg,15159
 ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
-ipex_llm/transformers/models/qwen.py,sha256=iP4wcjdIZ0CvqbM8muM96y-rghpEnZSR3TgjiOBIq5k,20475
-ipex_llm/transformers/models/qwen2.py,sha256=k_FcPfPAXIotRE03ULFBYA5hWE0M5CfktAhCWbEy8Yw,26369
+ipex_llm/transformers/models/qwen.py,sha256=XIJ_bLzediBURWU-OOS3H6WBIGXQue6jDdUHJsAabwY,19391
+ipex_llm/transformers/models/qwen2.py,sha256=b49HO4GSudwGJ3n6uHVno1oo3DgRt3jOjtQnLOB3cdY,25530
 ipex_llm/transformers/models/qwen2_moe.py,sha256=EA_OYxYAEgrvi7VpDW192AJXG9Fwe2aBtOAZPkOAJk4,19350
 ipex_llm/transformers/models/qwen2_vl.py,sha256=jIm4yZSd751BkRqgj3wR1QBkDIh-TMCLAMM8SZ8n6Qo,13419
 ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
@@ -192,7 +192,7 @@ ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLr
 ipex_llm/transformers/npu_models/chatglm.py,sha256=YzpGLZ7ORt6qkwW9mCwZ_xhOAI8uHSDHJrmqWgNM234,10511
 ipex_llm/transformers/npu_models/chatglm4.py,sha256=J4523DzhIzZxIvlf1V9qU4auzEGKvC80YqyxuCJygjw,9795
 ipex_llm/transformers/npu_models/common.py,sha256=tTUJL7IxVrJSnXle6nla35wTUrBf2sOEt7Ya1qyMezY,4853
-ipex_llm/transformers/npu_models/convert.py,sha256=cX10r7Q0a2qFcEIhRcvmw1eSmUtmeOcoNn8kVqv3Su8,24224
+ipex_llm/transformers/npu_models/convert.py,sha256=FILSGnoltcR9FMrCkw0eOKh6p3sbBI5i0Ms8AsJc04E,25342
 ipex_llm/transformers/npu_models/convert_mp.py,sha256=t7160V4MmYpnex2NfuLTcqoc1meGEXdYi4AAPotfbzk,24518
 ipex_llm/transformers/npu_models/glm_edge.py,sha256=VsJex-6530h4ZQk35TxRe1MnttAHT41omE8LV47LgBE,6723
 ipex_llm/transformers/npu_models/kv.py,sha256=2OSFO9Z6e4nGdVxXEM-Bq2qa_npYYbGmQt3lcCZxTlU,9201
@@ -205,7 +205,7 @@ ipex_llm/transformers/npu_models/minicpm_mp.py,sha256=0iCRWN9UIUQp5tSKyu-orpGCOx
 ipex_llm/transformers/npu_models/minicpmv_mp.py,sha256=m11WT6s_H5wkFtlz7aHMOL9b_CoL_G5MhoL5te4la_Q,20147
 ipex_llm/transformers/npu_models/mistral.py,sha256=iRdmIQI_bbbZxRCYRvnV4rWjX2t-6vkHNl1ICAsLoy4,10759
 ipex_llm/transformers/npu_models/mp_models_base.py,sha256=rY-5tq8DfxRsiaIITl0PQOTiPLJnUm_5L-oWzbK12N8,28429
-ipex_llm/transformers/npu_models/npu_llm_cpp.py,sha256=SL1p5UBjheufhum-GktNQQ4iXjRlZ9Hgatzb3NFr6Bs,3900
+ipex_llm/transformers/npu_models/npu_llm_cpp.py,sha256=B40sBujvy31ETFBgcYAf4CN23UuTCBEJVaxjIMaoEHk,4268
 ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ3P5p4lNp7p6wMV7KrtCU,37871
 ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
 ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
@@ -220,7 +220,7 @@ ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=953Gua2tFKLI
 ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=MnvHRytLt3oy5jIPUBe8AeEJ6PtPWLbhQ5a9WqjZ1TQ,19905
 ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=MDMesYlVbECKdK0xxkt1LwHgpkJOO7ZwBExYAwMGQa0,20637
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=iVUNrcRLXE5eQGZIetgkLTINo8RW15RHM7SUetvJNRQ,14748
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=3paMXr1viuztybhmVLqQ9XvM3EZbxncDuNSNwLF8OI0,14849
 ipex_llm/utils/__init__.py,sha256=NdB_InYE65dNgW4ruEPUOlgKEO2ELcsJoqkP7O5kpog,1391
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -251,11 +251,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.0b20250101.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
-ipex_llm-2.2.0b20250101.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
-ipex_llm-2.2.0b20250101.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
-ipex_llm-2.2.0b20250101.dist-info/METADATA,sha256=J_SK__bQX0TB_vOUTfpk9DV4DhTzVc-LzVzN_LmERV8,11374
-ipex_llm-2.2.0b20250101.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
-ipex_llm-2.2.0b20250101.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
-ipex_llm-2.2.0b20250101.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
-ipex_llm-2.2.0b20250101.dist-info/RECORD,,
+ipex_llm-2.2.0b20250102.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ipex_llm-2.2.0b20250102.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ipex_llm-2.2.0b20250102.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ipex_llm-2.2.0b20250102.dist-info/METADATA,sha256=fF_EkmZQW5wODRZlaJEQgQnS6Xieiem4h1vZcvsRxRE,11374
+ipex_llm-2.2.0b20250102.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ipex_llm-2.2.0b20250102.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250102.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250102.dist-info/RECORD,,