ipex-llm 2.2.0b20250101__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250102__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/optimize.py +3 -1
- ipex_llm/transformers/convert.py +3 -2
- ipex_llm/transformers/low_bit_linear.py +8 -2
- ipex_llm/transformers/model.py +3 -1
- ipex_llm/transformers/models/baichuan.py +5 -11
- ipex_llm/transformers/models/chatglm.py +2 -2
- ipex_llm/transformers/models/qwen.py +34 -46
- ipex_llm/transformers/models/qwen2.py +5 -19
- ipex_llm/transformers/npu_model.py +3 -3
- ipex_llm/transformers/npu_models/convert.py +40 -18
- ipex_llm/transformers/npu_models/npu_llm_cpp.py +18 -9
- ipex_llm/transformers/npu_pipeline_model/qwen.py +4 -0
- {ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/METADATA +19 -19
- {ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/RECORD +20 -20
- {ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/ipex-llm-init +0 -0
- {ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-chat +0 -0
- {ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-cli +0 -0
- {ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/top_level.txt +0 -0
ipex_llm/optimize.py
CHANGED
@@ -254,7 +254,9 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
         torch_dtype=torch_dtype,
         optimize_model=optimize_llm,
         modules_to_not_convert=modules_to_not_convert,
-        cpu_embedding=cpu_embedding
+        cpu_embedding=cpu_embedding,
+        disable_optimize_pre=kwargs.pop("disable_optimize_pre",
+                                        False))
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()
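The new keyword is popped from `**kwargs`, so existing callers are unaffected; passing it skips the model-specific pre-optimization pass while still converting weights to low-bit format. A minimal usage sketch (the model path below is a placeholder):

```python
import torch
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

# Load any Hugging Face causal LM first (path/name is illustrative only).
model = AutoModelForCausalLM.from_pretrained("path/to/model", torch_dtype=torch.float16)

# New in this build: disable_optimize_pre=True skips the _optimize_pre step
# before low-bit conversion; the default behaviour is unchanged.
model = optimize_model(model, low_bit="sym_int4", disable_optimize_pre=True)
```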
ipex_llm/transformers/convert.py
CHANGED
@@ -1081,7 +1081,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
-                         mixed_precision=False
+                         mixed_precision=False,
+                         disable_optimize_pre=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -1104,7 +1105,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
         model = _optimize_ipex(model, qtype)
         return model
 
-    if optimize_model:
+    if optimize_model and not disable_optimize_pre:
         model = _optimize_pre(model, qtype)
 
     act_order = False
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -764,6 +764,7 @@ class FP16Linear(nn.Linear):
         # weigh_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
+        self.disable_fp16_opt = False
 
     def forward(self, x: torch.Tensor):
         # only work for GPU
@@ -779,8 +780,11 @@
             self.weight.data = self.weight.data.to(x.dtype)
 
         if not self.use_esimd_kernel(x):
-            if
-            ...
+            if (
+                get_ipex_version() < "2.1.10+xpu"
+                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or self.disable_fp16_opt
+            ):
                 if self.weight_type == 2:
                     self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
                                                      requires_grad=False)
@@ -845,6 +849,8 @@
 
     def use_esimd_kernel(self, x):
         gpu_type = get_xpu_device_type(x)
+        if self.disable_fp16_opt:
+            return False
         # esimd kernel can only be used for Arc and Flex
         if gpu_type not in ["arc", "flex"]:
             return False
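`disable_fp16_opt` defaults to `False` and is a plain attribute on `FP16Linear`, so it can be toggled after conversion. A sketch of a hypothetical helper (not part of the package) that flips it on every FP16 linear layer, forcing the generic path instead of the ESIMD kernel:

```python
from ipex_llm.transformers.low_bit_linear import FP16Linear

def set_disable_fp16_opt(model, value=True):
    # Hypothetical helper: walk a converted model and set the new flag on
    # every FP16Linear so use_esimd_kernel() returns False for those layers.
    for module in model.modules():
        if isinstance(module, FP16Linear):
            module.disable_fp16_opt = value
    return model
```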
ipex_llm/transformers/model.py
CHANGED
@@ -445,6 +445,7 @@ class _BaseAutoModelClass:
         mixed_precision = kwargs.pop("mixed_precision", False)
         if embedding_qtype is not None:
             embedding_qtype = ggml_tensor_qtype[embedding_qtype]
+        disable_optimize_pre = kwargs.pop("disable_optimize_pre", False)
         _args = copy.deepcopy(args)
         _kwargs = copy.deepcopy(kwargs)
         awq_config = None
@@ -513,7 +514,8 @@ class _BaseAutoModelClass:
             torch_dtype=kwargs.get("torch_dtype", 'auto'),
             imatrix_data=imatrix_data,
             embedding_qtype=embedding_qtype,
-            mixed_precision=mixed_precision
+            mixed_precision=mixed_precision,
+            disable_optimize_pre=disable_optimize_pre)
 
         if disk_embedding:
             from ipex_llm.transformers.embedding import DiskEmbedding
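On the `from_pretrained` path the same flag is accepted as an extra keyword and forwarded to `ggml_convert_low_bit`. A hedged usage sketch (the model id is a placeholder):

```python
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct",   # placeholder model id
    load_in_4bit=True,          # standard ipex-llm low-bit loading
    disable_optimize_pre=True,  # new kwarg added in this build
    trust_remote_code=True,
)
```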
ipex_llm/transformers/models/baichuan.py
CHANGED
@@ -29,7 +29,7 @@ from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp
     should_use_compresskv
 from ipex_llm.transformers.models.utils import update_past_key_value
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
-from ipex_llm.transformers.models.utils import
+from ipex_llm.transformers.models.utils import use_sdp
 from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU
 from ipex_llm.transformers.models.utils import mlp_fusion_check
 from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36
@@ -301,16 +301,10 @@ def baichuan_attention_forward_7b(
 
     # IPEX-LLM OPT: sdp
     attn_weights = None
-    ...
-                                     is_causal=True).to(hidden_states.dtype)
-    else:
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
ipex_llm/transformers/models/chatglm.py
CHANGED
@@ -23,7 +23,7 @@ import torch.utils.checkpoint
 import torch.nn.functional as F
 from typing import Optional, Tuple
 from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache
-from ipex_llm.transformers.models.utils import
+from ipex_llm.transformers.models.utils import use_sdp
 
 
 def rotate_half(x):
@@ -41,7 +41,7 @@ def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
 
 
 def glm_sdpa(query, key, value, attention_mask=None, is_causal=False):
-    if
+    if query.device.type == 'cpu':
         context_layer = F.scaled_dot_product_attention(query.to(key.dtype),
                                                        key,
                                                        value,
ipex_llm/transformers/models/qwen.py
CHANGED
@@ -33,7 +33,6 @@ from ipex_llm.transformers.models.utils import update_past_key_value, should_use
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache
 from ipex_llm.transformers.models.utils import rotate_half, SILU
 from ipex_llm.transformers.models.utils import mlp_fusion_check
-from ipex_llm.transformers.models.utils import use_flash_attention
 from ipex_llm.utils.common import invalidInputError
 from transformers.modeling_outputs import BaseModelOutputWithPast
 
@@ -116,33 +115,28 @@ def qwen_attention_forward(
     past_key_value = (key_states.transpose(1, 2),
                       value_states.transpose(1, 2)) if use_cache else None
 
-    # IPEX-LLM OPT:
+    # IPEX-LLM OPT: sdpa
     attn_weights = None
-    ...
+
+    if q_len > 1 and q_len != kv_seq_len:
+        causal_mask = torch.tril(
+            torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
+        ).view(1, 1, kv_seq_len, kv_seq_len)
+        causal_mask = causal_mask[
+            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+        ]
+        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+                                     device=query_states.device)
+        attention_mask.masked_fill_(causal_mask.logical_not(),
+                                    torch.finfo(attention_mask.dtype).min)
+        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
     else:
-        ...
-        causal_mask = torch.tril(
-            torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool, device=query_states.device)
-        ).view(1, 1, kv_seq_len, kv_seq_len)
-        causal_mask = causal_mask[
-            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
-        ]
-        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
-                                     device=query_states.device)
-        attention_mask.masked_fill_(causal_mask.logical_not(),
-                                    torch.finfo(attention_mask.dtype).min)
-        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
-    else:
-        attention_mask = None
+        attention_mask = None
 
-    ...
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.view(bsz, q_len, self.hidden_size)
@@ -219,31 +213,25 @@ def qwen_attention_forward_registered(
     past_key_value = (key_states.transpose(1, 2),
                       value_states.transpose(1, 2)) if use_cache else None
 
-    # IPEX-LLM OPT:
+    # IPEX-LLM OPT: sdpa
    attn_weights = None
 
-    if ...
+    if q_len > 1 and q_len != kv_seq_len:
+        causal_mask = registered_causal_mask[
+            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
+        ]
+        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
+                                     device=query_states.device)
+        attention_mask.masked_fill_(causal_mask.logical_not(),
+                                    torch.finfo(attention_mask.dtype).min)
+        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
     else:
-        ...
-        causal_mask = registered_causal_mask[
-            :, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len
-        ]
-        attention_mask = torch.zeros(causal_mask.shape, dtype=query_states.dtype,
-                                     device=query_states.device)
-        attention_mask.masked_fill_(causal_mask.logical_not(),
-                                    torch.finfo(attention_mask.dtype).min)
-        attention_mask = attention_mask.expand([bsz, -1, -1, -1])
-    else:
-        attention_mask = None
+        attention_mask = None
 
-    ...
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.view(bsz, q_len, self.hidden_size)
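The rewritten branch builds an explicit additive causal mask only for the partial-prefill case (`q_len > 1 and q_len != kv_seq_len`); otherwise it passes `attention_mask = None` and lets `scaled_dot_product_attention` handle full prefill and decode. The mask construction can be checked in isolation with plain PyTorch:

```python
import torch

bsz, q_len, kv_seq_len = 1, 3, 8   # toy sizes: 3 new tokens over 8 total positions

causal_mask = torch.tril(
    torch.ones((kv_seq_len, kv_seq_len), dtype=torch.bool)
).view(1, 1, kv_seq_len, kv_seq_len)
# keep only the rows belonging to the q_len newest query positions
causal_mask = causal_mask[:, :, kv_seq_len - q_len:kv_seq_len, :kv_seq_len]

attention_mask = torch.zeros(causal_mask.shape, dtype=torch.float16)
attention_mask.masked_fill_(causal_mask.logical_not(),
                            torch.finfo(attention_mask.dtype).min)
attention_mask = attention_mask.expand([bsz, -1, -1, -1])

print(attention_mask.shape)   # torch.Size([1, 1, 3, 8])
```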
ipex_llm/transformers/models/qwen2.py
CHANGED
@@ -38,12 +38,10 @@
 #
 
 import os
-import math
 from typing import Optional, Tuple, Union, List
 
 import torch
 from torch.nn import CrossEntropyLoss
-from torch.nn.functional import scaled_dot_product_attention as sdpa
 
 from ipex_llm.transformers.models.common import merge_qkv_base
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
@@ -51,13 +49,12 @@ from ipex_llm.transformers.models.utils import SILU, mlp_fusion_check
 from ipex_llm.transformers.models.utils import should_use_fuse_rope
 from ipex_llm.transformers.models.utils import use_quantize_kv_cache, \
     should_use_compresskv, is_enough_kv_cache_room_4_36
-from ipex_llm.transformers.models.utils import use_flash_attention
 from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache, \
     DynamicCompressCache, DynamicCompressFp8Cache
 from ipex_llm.utils.common import invalidInputError
 
 from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2MLP
-from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
+from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.cache_utils import Cache
 from transformers import logging
@@ -580,21 +577,10 @@ def qwen2_attention_forward(
                                       self.layer_idx, None)
 
     attn_weights = None
-    ...
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        attn_output = sdpa(query_states.to(device, dtype=torch.float16),
-                           key_states.to(device, dtype=torch.float16),
-                           value_states.to(device, dtype=torch.float16),
-                           is_causal=True).to(hidden_states.dtype)
-    else:
-        attn_output = scaled_dot_product_attention(
-            query_states, key_states, value_states,
-            attention_mask, q_len == kv_seq_len
-        )
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len
+    )
 
     attn_output = attn_output.transpose(1, 2).contiguous()
     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
ipex_llm/transformers/npu_model.py
CHANGED
@@ -301,8 +301,7 @@ class _BaseAutoModelClass:
             model.share_memory()
 
             if not pipeline:
-                if
-                        model.config.model_type in ["qwen2", "llama", "minicpm"]):
+                if model.config.model_type in ["qwen2", "llama", "minicpm"]:
                     from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                     optimize_llm_single_process(
                         llm,
@@ -312,7 +311,8 @@ class _BaseAutoModelClass:
                         group_size=quantization_group_size,
                         qtype=qtype,
                         save_directory=save_directory,
-                        fuse_layers=fuse_layers
+                        fuse_layers=fuse_layers,
+                        has_llm=hasattr(model, "llm")
                     )
                 else:
                     optimize_llm(
ipex_llm/transformers/npu_models/convert.py
CHANGED
@@ -449,7 +449,8 @@ def optimize_llm_single_process(
         group_size: int,
         qtype: str,
         save_directory: str,
-        fuse_layers: int=None
+        fuse_layers: int=None,
+        has_llm: bool=False
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -468,8 +469,13 @@ def optimize_llm_single_process(
         model.kv_len = kv_len
         model.model_ptr = model_ptr
         model.save_directory = save_directory
-        ...
+        if model.config.vocab_size == 151666:
+            # for MiniCPM-V 2.6, 152064 is vocab_size of Qwen2-7B
+            model.vocab_size = 152064
+        else:
+            model.vocab_size = model.config.vocab_size
         model.logits_buffer = torch.empty(1, 1, model.vocab_size, dtype=torch.float32)
+        model.max_prompt_len = max_prompt_len
     except:
         invalidInputError(False,
                           "False to InitLLMPipeline.")
@@ -478,9 +484,10 @@ def optimize_llm_single_process(
     general_convert(model, PreTrainedModel, prepare_input_ids, "prepare_inputs_for_generation")
     general_convert(model, PreTrainedModel, causal_lm_forward)
     # patch generate function
-    ...
+    if not has_llm:
+        import types
+        model.original_generate = model.generate
+        model.generate = types.MethodType(generate, model)
     return model
 
 
@@ -491,9 +498,10 @@ def prepare_input_ids(
     else: # prefill, reset the model here
         from .npu_llm_cpp import reset
         reset(self.model_ptr)
-    ...
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
     return model_inputs
 
 
@@ -511,17 +519,31 @@ def causal_lm_forward(
     return_dict: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     from .npu_llm_cpp import run_prefill_with_logits, run_decode_with_logits
-    if ...
+    if input_ids is not None:
+        if isinstance(input_ids[0], torch.Tensor):
+            input_list = input_ids[0].flatten().tolist()
+        else:
+            input_list = input_ids[0]
+        input_length = len(input_list)
+        if input_length > 1:
+            logits = run_prefill_with_logits(self.model_ptr, input_list,
+                                             self.logits_buffer, self.vocab_size)
+        else:
+            logits = run_decode_with_logits(self.model_ptr, input_list[0],
+                                            self.logits_buffer, self.vocab_size)
+    elif inputs_embeds is not None:
+        seq_len = inputs_embeds.shape[1]
+        pad_len = self.max_prompt_len - seq_len
+        inputs_embeds = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
+                                                (0, 0, 0, pad_len), value=0.0)
+        logits = run_prefill_with_logits(self.model_ptr, None, self.logits_buffer,
+                                         self.vocab_size, inputs_embeds, seq_len)
     else:
-        ...
+        invalidInputError(False, "Please specify either input_ids or inputs_embeds.")
+
+    if self.config.vocab_size == 151666:
+        # for MiniCPM-V 2.6
+        logits = logits[:, :, :151666]
 
     return CausalLMOutputWithPast(
         loss=None,
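In the new `inputs_embeds` path the prompt is padded up to the fixed NPU prompt length before prefill. The padding call can be sanity-checked on its own (the sizes below are made up):

```python
import torch

max_prompt_len, seq_len, hidden = 512, 7, 3584   # illustrative sizes only
inputs_embeds = torch.randn(1, seq_len, hidden)

# F.pad works from the last dimension inward: (0, 0) leaves the hidden dim
# alone, (0, pad_len) appends zeros along the sequence dimension.
pad_len = max_prompt_len - seq_len
padded = torch.nn.functional.pad(inputs_embeds.to(torch.float16),
                                 (0, 0, 0, pad_len), value=0.0)
print(padded.shape)   # torch.Size([1, 512, 3584])
```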
ipex_llm/transformers/npu_models/npu_llm_cpp.py
CHANGED
@@ -48,8 +48,8 @@ _lib = ctypes.cdll.LoadLibrary(_lib_path)
 _lib.load_model_from_file.argtypes = [ctypes.c_char_p]
 _lib.load_model_from_file.restype = ctypes.c_void_p
 
-_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.
-                             ctypes.c_float]
+_lib.run_prefill.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
+                             ctypes.c_float, ctypes.c_bool]
 _lib.run_prefill.restype = ctypes.POINTER(ctypes.c_float)
 
 _lib.run_decode.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_float]
@@ -61,8 +61,10 @@ _lib.llm_sample_token.restype = ctypes.c_int
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
 
-_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.
-                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+_lib.run_prefill_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_void_p,
+                                         ctypes.c_int, ctypes.POINTER(ctypes.c_float),
+                                         ctypes.c_int, ctypes.c_bool]
+
 _lib.run_prefill_with_logits.restype = None
 
 _lib.run_decode_with_logits.argtypes = [ctypes.c_void_p, ctypes.c_int,
@@ -77,7 +79,7 @@ def load_model_from_file(model_dir: str):
 def run_prefill(model_ptr, input_ids, vocab_size, repetition_penalty=1.0):
     input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
     input_len = len(input_ids)
-    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty)
+    plogits = _lib.run_prefill(model_ptr, input_ptr, input_len, repetition_penalty, False)
     new_token = _lib.llm_sample_token(plogits, True, vocab_size)
     return new_token
 
@@ -88,12 +90,19 @@ def run_decode(model_ptr, input_id, vocab_size, repetition_penalty=1.0):
     return new_token
 
 
-def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size
-    ...
+def run_prefill_with_logits(model_ptr, input_ids, logits, vocab_size,
+                            inputs_embeds=None, seq_len=None):
+    if input_ids is not None:
+        input_ptr = (ctypes.c_int32 * len(input_ids))(*input_ids)
+        input_len = len(input_ids)
+    else:
+        input_ptr = inputs_embeds.contiguous().data.data_ptr()
+        input_ptr = ctypes.cast(input_ptr, ctypes.c_void_p)
+        input_len = seq_len
     logits_ptr = logits.data.data_ptr()
     logits_ptr = ctypes.cast(logits_ptr, ctypes.POINTER(ctypes.c_float))
-    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+    _lib.run_prefill_with_logits(model_ptr, input_ptr, input_len, logits_ptr,
+                                 vocab_size, (input_ids is None))
     return logits
 
 
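When embeddings are passed instead of token ids, the Python side hands the C library a raw buffer address cast to `void*`, mirroring the existing logits-buffer handling. The ctypes pattern in isolation:

```python
import ctypes
import torch

embeds = torch.randn(1, 8, 16, dtype=torch.float16).contiguous()

# Same pattern as the updated run_prefill_with_logits: take the address of a
# contiguous tensor's storage and cast it to a void pointer for the C call.
ptr = ctypes.cast(embeds.data_ptr(), ctypes.c_void_p)
print(hex(ptr.value))
```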
ipex_llm/transformers/npu_pipeline_model/qwen.py
CHANGED
@@ -34,6 +34,10 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     lm_head_n_splits = 1
     asym = getattr(model.config, "asym", False)
 
+    if vocab_size == 151666:
+        # for MiniCPM-V 2.6 lm_head on NPU
+        vocab_size = 152064
+
     if not isinstance(lm_head, SlicedLMHead):
         asym = lm_head.qtype == "asym_int4_rtn"
         if asym:
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250101
+Version: 2.2.0b20250102
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250102 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250102 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: dpcpp-cpp-rt ==2024.2.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -65,7 +65,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.
+Requires-Dist: bigdl-core-npu ==2.6.0b20250102 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -85,9 +85,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250102 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250102 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250102 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -102,9 +102,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250102 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -119,9 +119,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -141,9 +141,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -163,9 +163,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250102 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ ipex_llm/__init__.py,sha256=kSA9JjVLPlpN4YchWtfOybRh4XiP6d_VTYvzbAouPSU,2118
 ipex_llm/convert_model.py,sha256=jopEe6wu88ZPZfNFhgnQUu7807iciiWW_EMyTsVni5A,6816
 ipex_llm/llm_patching.py,sha256=becMYcawtR8lgl2yeRQhvvX6CLaq09WZGm9dDmLJWL0,3232
 ipex_llm/models.py,sha256=XROP6GLLrGQDlogGXpXZENbV143YNi6j0VPJeOdQ3Cg,1063
-ipex_llm/optimize.py,sha256=
+ipex_llm/optimize.py,sha256=ml-qEpzsrWGcd-Wia6IxPBC1PhqT5pi_lp3VTOH_ns0,12415
 ipex_llm/cli/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
 ipex_llm/cli/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
 ipex_llm/cli/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
@@ -94,17 +94,17 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
 ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
 ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
 ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
-ipex_llm/transformers/convert.py,sha256=
+ipex_llm/transformers/convert.py,sha256=V4KDyi-2FVWSYZAxe4PlAxGGZbauSbOCuqq56ME9yyQ,106461
 ipex_llm/transformers/convert_ipex.py,sha256=iKXo0n8fVFTOA2fNYYrByMFK0dovL-kLd2sVDk88AlQ,14334
 ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
 ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
 ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s,3289
 ipex_llm/transformers/loader.py,sha256=cOgX93xOC-4dt01GTJ5wyd7PjZ8S43r4mctkR2YxVuw,6893
 ipex_llm/transformers/lookup.py,sha256=c4ETIha6ZLbWvhcclSKRDdi5Ipuet4mfUnOkBa0E8kk,19607
-ipex_llm/transformers/low_bit_linear.py,sha256=
-ipex_llm/transformers/model.py,sha256=
+ipex_llm/transformers/low_bit_linear.py,sha256=dyyYyCqw0GK8hzaUGanrg-uIhU1HTLEEbvbxXMlm-80,41668
+ipex_llm/transformers/model.py,sha256=KcRjkauGg48BYrUBoUZaVMpg7Piuz5JrfIpVZd3EIjs,41105
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=
+ipex_llm/transformers/npu_model.py,sha256=a1mkyc6EqD7AJhqbYzokGhFubNpt5trIMuZT_dQKlTk,37861
 ipex_llm/transformers/patches.py,sha256=halPWm__ORh2fRFSIFPiCNg3LQBfrRkTPtmtRpBJCZQ,1286
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
 ipex_llm/transformers/qlora.py,sha256=jtPGsvWFjbTUGzDBCdfftnCis_0nJQNRpACSwXUbbGU,14943
@@ -143,10 +143,10 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
 ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
 ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
 ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
-ipex_llm/transformers/models/baichuan.py,sha256=
+ipex_llm/transformers/models/baichuan.py,sha256=oJCAEENSG8oQhJ-QPN2SiapARjAGdOM6nEbyCcYOMCo,19334
 ipex_llm/transformers/models/bert.py,sha256=bJNic2pt1kph0kBwdK5MRGyWupFfx2Ts0V3D1L-5kWo,6085
 ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
-ipex_llm/transformers/models/chatglm.py,sha256=
+ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
 ipex_llm/transformers/models/chatglm2.py,sha256=kfJThuKYb3unAB1XCzfop1iDW1gOkyFOjSr-lEjUdS0,24781
 ipex_llm/transformers/models/chatglm4.py,sha256=AAhAFFDDas5DBQPfh2Mwl7a2v7taKf6xphoeeNNFaBI,16593
 ipex_llm/transformers/models/chatglm4v.py,sha256=YRfuf9g1E0MQ_7wbHAOMvadFnO-j3LqI_k1SaRkDs0M,14055
@@ -174,8 +174,8 @@ ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifs
 ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
 ipex_llm/transformers/models/phi3.py,sha256=jkiadJ85ToHpymY5GOM6orWlnx6LKN8_-v1MUcfGWPg,15159
 ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
-ipex_llm/transformers/models/qwen.py,sha256=
-ipex_llm/transformers/models/qwen2.py,sha256=
+ipex_llm/transformers/models/qwen.py,sha256=XIJ_bLzediBURWU-OOS3H6WBIGXQue6jDdUHJsAabwY,19391
+ipex_llm/transformers/models/qwen2.py,sha256=b49HO4GSudwGJ3n6uHVno1oo3DgRt3jOjtQnLOB3cdY,25530
 ipex_llm/transformers/models/qwen2_moe.py,sha256=EA_OYxYAEgrvi7VpDW192AJXG9Fwe2aBtOAZPkOAJk4,19350
 ipex_llm/transformers/models/qwen2_vl.py,sha256=jIm4yZSd751BkRqgj3wR1QBkDIh-TMCLAMM8SZ8n6Qo,13419
 ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
@@ -192,7 +192,7 @@ ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLr
 ipex_llm/transformers/npu_models/chatglm.py,sha256=YzpGLZ7ORt6qkwW9mCwZ_xhOAI8uHSDHJrmqWgNM234,10511
 ipex_llm/transformers/npu_models/chatglm4.py,sha256=J4523DzhIzZxIvlf1V9qU4auzEGKvC80YqyxuCJygjw,9795
 ipex_llm/transformers/npu_models/common.py,sha256=tTUJL7IxVrJSnXle6nla35wTUrBf2sOEt7Ya1qyMezY,4853
-ipex_llm/transformers/npu_models/convert.py,sha256=
+ipex_llm/transformers/npu_models/convert.py,sha256=FILSGnoltcR9FMrCkw0eOKh6p3sbBI5i0Ms8AsJc04E,25342
 ipex_llm/transformers/npu_models/convert_mp.py,sha256=t7160V4MmYpnex2NfuLTcqoc1meGEXdYi4AAPotfbzk,24518
 ipex_llm/transformers/npu_models/glm_edge.py,sha256=VsJex-6530h4ZQk35TxRe1MnttAHT41omE8LV47LgBE,6723
 ipex_llm/transformers/npu_models/kv.py,sha256=2OSFO9Z6e4nGdVxXEM-Bq2qa_npYYbGmQt3lcCZxTlU,9201
@@ -205,7 +205,7 @@ ipex_llm/transformers/npu_models/minicpm_mp.py,sha256=0iCRWN9UIUQp5tSKyu-orpGCOx
 ipex_llm/transformers/npu_models/minicpmv_mp.py,sha256=m11WT6s_H5wkFtlz7aHMOL9b_CoL_G5MhoL5te4la_Q,20147
 ipex_llm/transformers/npu_models/mistral.py,sha256=iRdmIQI_bbbZxRCYRvnV4rWjX2t-6vkHNl1ICAsLoy4,10759
 ipex_llm/transformers/npu_models/mp_models_base.py,sha256=rY-5tq8DfxRsiaIITl0PQOTiPLJnUm_5L-oWzbK12N8,28429
-ipex_llm/transformers/npu_models/npu_llm_cpp.py,sha256=
+ipex_llm/transformers/npu_models/npu_llm_cpp.py,sha256=B40sBujvy31ETFBgcYAf4CN23UuTCBEJVaxjIMaoEHk,4268
 ipex_llm/transformers/npu_models/paraformer_mp.py,sha256=lGEjmKHW_Pk3BE3nqa1ZVgJ3P5p4lNp7p6wMV7KrtCU,37871
 ipex_llm/transformers/npu_models/phi3.py,sha256=R-EuqHsTrPTX33HtCGAMFlRdXB_j5mH_7FDnj62JtNM,6555
 ipex_llm/transformers/npu_models/phi3_v.py,sha256=EMZuTPkGfuDVp9c5BU1HyzXHWKswHRQ8bvQjzocIyHA,7737
@@ -220,7 +220,7 @@ ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=953Gua2tFKLI
 ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=MnvHRytLt3oy5jIPUBe8AeEJ6PtPWLbhQ5a9WqjZ1TQ,19905
 ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=MDMesYlVbECKdK0xxkt1LwHgpkJOO7ZwBExYAwMGQa0,20637
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=3paMXr1viuztybhmVLqQ9XvM3EZbxncDuNSNwLF8OI0,14849
 ipex_llm/utils/__init__.py,sha256=NdB_InYE65dNgW4ruEPUOlgKEO2ELcsJoqkP7O5kpog,1391
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -251,11 +251,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
+ipex_llm-2.2.0b20250102.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ipex_llm-2.2.0b20250102.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ipex_llm-2.2.0b20250102.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ipex_llm-2.2.0b20250102.dist-info/METADATA,sha256=fF_EkmZQW5wODRZlaJEQgQnS6Xieiem4h1vZcvsRxRE,11374
+ipex_llm-2.2.0b20250102.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ipex_llm-2.2.0b20250102.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250102.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250102.dist-info/RECORD,,
{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/ipex-llm-init
File without changes
{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-chat
File without changes
{ipex_llm-2.2.0b20250101.data → ipex_llm-2.2.0b20250102.data}/scripts/llm-cli
File without changes
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/WHEEL
File without changes
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/entry_points.txt
File without changes
{ipex_llm-2.2.0b20250101.dist-info → ipex_llm-2.2.0b20250102.dist-info}/top_level.txt
File without changes