ipex-llm 2.2.0b20250106__py3-none-win_amd64.whl → 2.2.0b20250106.post1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +19 -158
- ipex_llm/transformers/loader.py +1 -1
- ipex_llm/transformers/lookup.py +2 -2
- ipex_llm/transformers/low_bit_linear.py +15 -29
- ipex_llm/transformers/model.py +0 -7
- ipex_llm/transformers/models/chatglm2.py +1 -192
- ipex_llm/transformers/models/minicpmv.py +2 -2
- ipex_llm/transformers/models/sd.py +2 -2
- ipex_llm/transformers/models/utils.py +16 -104
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
- ipex_llm/transformers/speculative.py +2 -14
- ipex_llm/transformers/utils.py +7 -20
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/METADATA +40 -19
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/RECORD +49 -53
- ipex_llm/transformers/models/cohere.py +0 -589
- ipex_llm/transformers/models/falcon.py +0 -829
- ipex_llm/transformers/models/gptj.py +0 -441
- ipex_llm/transformers/models/mixtral.py +0 -576
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file

ipex_llm/libs/bloom.dll
CHANGED
Binary file

ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file

ipex_llm/libs/gptneox.dll
CHANGED
Binary file

ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file

ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file

ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file

ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file

ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file

ipex_llm/libs/llama-api.dll
CHANGED
Binary file

ipex_llm/libs/llama.dll
CHANGED
Binary file

ipex_llm/libs/main-bloom.exe
CHANGED
Binary file

ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file

ipex_llm/libs/main-llama.exe
CHANGED
Binary file

ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file

ipex_llm/libs/pipeline.dll
CHANGED
Binary file

ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file

ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file

ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file

ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file

ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file

ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file

ipex_llm/libs/starcoder.dll
CHANGED
Binary file
ipex_llm/transformers/convert.py
CHANGED
@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     optimize_lm_head=optimize_lm_head
                 )
                 device = module.weight.data.device
-
-                if get_ipex_version() < "2.1.10+xpu":
-                    new_linear._parameters['weight'] = nn.Parameter(module.weight)
-                else:
-                    # only from 2.1, ipex provides matmul_bias_out
-                    # so we need to transpose weight
-                    new_weight = module.weight.transpose(0, 1).contiguous()
-                    new_linear._parameters['weight'] = nn.Parameter(new_weight)
-                    new_linear.weight_type = 2
+                new_linear._parameters['weight'] = nn.Parameter(module.weight)
                 if module.bias is not None:
-                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
-                        .to(device)
+                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
             elif qtype == ggml_tensor_qtype["bf16"]:
                 module.to(torch.bfloat16)
                 if _USE_VLLM:
@@ -1052,7 +1043,8 @@ def _optimize_pre(model, qtype=None):
         _optimize_pre(model.llm, qtype=qtype)
         model.llm.config.model_type = "megrezo"
     elif model.config.model_type == "chatglm":
-        if hasattr(model.config, 'padded_vocab_size') and
+        if hasattr(model.config, 'padded_vocab_size') and \
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2 and chatglm3
             from ipex_llm.transformers.models.chatglm2 import split_mlp
             model.apply(split_mlp)
@@ -1337,7 +1329,7 @@ def _optimize_post(model):
         and model.config.architectures[0] in ["ChatGLMModel", "ChatGLMForConditionalGeneration"]
     ):
         if hasattr(model.config, 'padded_vocab_size') and \
-                model.config.padded_vocab_size
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2-6b, chatglm2-6b-32k, chatglm3-6b, chatglm3-6b-32k, chatglm3-6b-128k
             modeling_module_name = model.__class__.__module__
             module = importlib.import_module(modeling_module_name)
@@ -1359,27 +1351,9 @@ def _optimize_post(model):
                             module.RMSNorm,
                             chatglm_rms_norm_forward)
             convert_forward(model, module.MLP, mlp_forward)
-
-
-
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            from ipex_llm.transformers.models.chatglm2 import codegeex_attention_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
-            from ipex_llm.transformers.models.chatglm2 import codegeex_model_forward
-            convert_forward(model,
-                            module.SelfAttention,
-                            codegeex_attention_forward)
-            convert_forward(model,
-                            module.GLMTransformer,
-                            chatglm2_encoder_forward)
-            convert_forward(model,
-                            module.ChatGLMModel,
-                            codegeex_model_forward)
-            convert_forward(model,
-                            module.RMSNorm,
-                            chatglm_rms_norm_forward)
+            # for codegeex-nano
+            if hasattr(model.config, "rope_ratio"):
+                model.transformer.rotary_pos_emb.rope_ratio = model.config.rope_ratio
         elif hasattr(model.config, 'vocab_size') and model.config.vocab_size == 130528:
             # chatglm-6b
             modeling_module_name = model.__class__.__module__
@@ -1469,21 +1443,6 @@ def _optimize_post(model):
                         module.MultiheadAttention,
                         mpt_multihead_attention_forward
                         )
-    elif "gptj" in model.config.model_type:
-        # dolly-v1-6b
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
-            gptj_block_forward
-        convert_forward(model,
-                        module.GPTJAttention,
-                        gptj_attention_forward)
-        convert_forward(model,
-                        module.GPTJModel,
-                        gptj_model_forward)
-        convert_forward(model,
-                        module.GPTJBlock,
-                        gptj_block_forward)
     elif "bloom" in model.config.model_type:
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1492,44 +1451,6 @@ def _optimize_post(model):
                        module.BloomAttention,
                        bloom_attention_forward
                        )
-    elif "falcon" in model.config.model_type or "RefinedWeb" in model.config.model_type:
-        if model.config.architectures is not None:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            if "RWForCausalLM" in model.config.architectures:
-                if model.config.hidden_size == 4544:
-                    # falcon-7b need to check performance drop after kv cache support.
-                    # from ipex_llm.transformers.models.falcon import rw_attention_forward_7b
-                    # convert_forward(model,
-                    #                 module.Attention,
-                    #                 rw_attention_forward_7b
-                    #                 )
-                    pass
-                else:
-                    # falcon-40b
-                    from ipex_llm.transformers.models.falcon import rw_attention_forward_40b
-                    convert_forward(model,
-                                    module.Attention,
-                                    rw_attention_forward_40b
-                                    )
-            elif "FalconForCausalLM" in model.config.architectures:
-                if model.config.hidden_size != 4544:
-                    # falcon-180b and new falcon-40b
-                    if version.parse(trans_version) >= version.parse("4.36.0"):
-                        # transformers version >= 4.36.0
-                        from ipex_llm.transformers.models.falcon import \
-                            falcon_attention_forward_4_36
-
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward_4_36
-                                        )
-                    else:
-                        from ipex_llm.transformers.models.falcon import falcon_attention_forward
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward
-                                        )
     elif model.config.model_type == "baichuan":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1748,31 +1669,6 @@ def _optimize_post(model):
         convert_forward(model, module.VisionAttention, qwen2_vision_attention_forward)
         convert_forward(model, module.Qwen2VLModel, qwen2_vl_model_forward)
         convert_forward(model, module.Qwen2VLAttention, qwen2_vl_attention_forward)
-    elif model.config.model_type == "cohere":
-        # for CohereForAI/c4ai-command-r-v01
-        invalidInputError(version.parse(trans_version) >= version.parse("4.40.0"),
-                          "Please upgrade transformers to 4.40.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        if version.parse(trans_version) >= version.parse("4.41.0"):
-            from ipex_llm.transformers.models.cohere import cohere_model_forward_4_41
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward_4_41)
-        else:
-            from ipex_llm.transformers.models.cohere import cohere_model_forward
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward)
-
-        from ipex_llm.transformers.models.cohere import cohere_attention_forward
-        convert_forward(model,
-                        module.CohereAttention,
-                        cohere_attention_forward)
-        convert_forward(model,
-                        module.CohereMLP,
-                        mlp_silu_forward)
     elif model.config.model_type == "aquila":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1784,31 +1680,6 @@ def _optimize_post(model):
         convert_forward(model,
                         module.AquilaRMSNorm,
                         rms_norm_forward)
-    elif model.config.model_type == "mixtral":
-        # For mistralai/Mixtral-8x7B-v0.1
-        invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                          "Please upgrade transformers to 4.36.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.mixtral import mixtral_moeblock_forward, \
-            mixtral_attention_forward, mixtral_mlp_forward, mixtral_model_forward
-        convert_forward(model,
-                        module.MixtralAttention,
-                        mixtral_attention_forward)
-        convert_forward(model,
-                        module.MixtralRMSNorm,
-                        rms_norm_forward)
-        convert_forward(model,
-                        module.MixtralSparseMoeBlock,
-                        mixtral_moeblock_forward)
-        convert_forward(model,
-                        module.MixtralBLockSparseTop2MLP,
-                        mixtral_mlp_forward)
-        convert_forward(model,
-                        module.MixtralModel,
-                        mixtral_model_forward)
-
     elif model.config.model_type == "phi-msft" and \
             hasattr(model.config, "num_local_experts"):
         # For phixtral, limit the condition to avoid applying on phi-2 hosted by ModelScope
@@ -1823,29 +1694,19 @@ def _optimize_post(model):
                         module.MLP,
                         phixtral_mlp_forward)
     elif model.config.model_type == "mistral":
-
-
-        # For DiscoResearch/mixtral-7b-8expert
-        invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                          "Please upgrade transformers to 4.36.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
-        else:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)

-
-
-
-
+        from ipex_llm.transformers.models.mistral import mistral_model_forward
+        from ipex_llm.transformers.models.mistral import mistral_attention_forward
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.common import mlp_silu_forward

-
-
-
-
-
+        convert_forward(model, module.MistralModel, mistral_model_forward)
+        convert_forward(model, module.MistralAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralSdpaAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
+        convert_forward(model, module.MistralMLP, mlp_silu_forward)
     elif model.config.model_type == "gemma":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
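Everything in the convert.py hunks above funnels through one pattern: convert_forward(model, SomeModuleClass, optimized_forward) swaps the forward of every matching submodule for an IPEX-LLM implementation, and the post1 build removes whole branches of that dispatch (gptj, falcon, cohere, mixtral) while flattening the mistral branch into unconditional conversions. As a rough, hypothetical sketch of what a convert_forward-style helper does (an assumption for illustration, not the package's actual code):

    import types
    import torch.nn as nn

    def convert_forward_sketch(model: nn.Module, target_class: type, new_forward) -> None:
        # Rebind `forward` on every submodule whose class matches target_class,
        # so later calls go through the replacement implementation.
        for module in model.modules():
            if module.__class__ == target_class:
                module.forward = types.MethodType(new_forward, module)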
ipex_llm/transformers/loader.py
CHANGED
@@ -22,7 +22,7 @@ import time
 from datetime import date
 import argparse
 from ipex_llm.utils.common import invalidInputError
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, LlamaTokenizer

 LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

ipex_llm/transformers/lookup.py
CHANGED
@@ -33,7 +33,7 @@ from ipex_llm.transformers.speculative import greedy, deepmind_sample, logits_to
     _crop_past_key_values, _prepare_generate_args, _non_cpu_ipex_verify, clear_benchmarks,\
     _prepare_generate_args_4_45
 from ipex_llm.utils.common import invalidInputError
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import get_xpu_device_name

 logger = logging.getLogger("ipex_llm.lookup")

@@ -295,7 +295,7 @@ def lookup_generate(self,
     invalidInputError(input_ids.shape[0] == 1,
                       "Prompt lookup is currently not supported with batch inference.")

-    device_name =
+    device_name = get_xpu_device_name(input_ids.device)

     candidates_generator = PromptLookupCandidateGenerator(
         num_output_tokens=num_output_tokens,
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -51,7 +51,7 @@ from torch import Tensor, device, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype,
+from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name, \
     get_ipex_version
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

@@ -266,7 +266,7 @@ def reshape_lm_head_input(x):


 def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
-
+    device_name = get_xpu_device_name(x.device)
     batch_size = x.shape[0]
     hard_condition = (
         x.dtype in [torch.float, torch.half]
@@ -286,7 +286,7 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
         or (
             qtype in [SYM_INT8, FP4, FP6, Q4_K, Q6_K]
             and batch_size <= 48
-            and
+            and device_name in ["arc", "pvc", "mtl", "arl"]
             and x.shape[1] % 256 == 0
             and output_len % 32 == 0
         )
@@ -295,8 +295,8 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
     if hard_condition:
         return (
             batch_size > 1
-            or (device in ["arc"
-            or (device in ["arc", "
+            or (device in ["arc"] and qtype in [SYM_INT8, FP4])
+            or (device in ["arc", "mtl"] and qtype in [FP8E4])
             or (device in ["lnl"] and qtype in [SYM_INT4] and x.shape[1] % 512 == 0)
             or (device in ["bmg"] and qtype in [SYM_INT4, FP8E5])
         )
@@ -603,7 +603,7 @@ class LowBitLinear(nn.Linear):
         # empty cache before and after lm_head at first token when input > 1024
         # on arc or IPEX_LLM_LOW_MEM is set to 1 at inference time.
         if self.device is None:
-            self.device =
+            self.device = get_xpu_device_name(self.weight.data.device)
             self.low_memory_mode = \
                 self.low_memory_mode and \
                 (self.device == "arc" or os.environ.get("IPEX_LLM_LOW_MEM", None) == "1")
@@ -759,9 +759,9 @@ class FP16Linear(nn.Linear):
         self.weight_length = self.out_len * self.in_len
         self.qtype = ggml_tensor_qtype["fp16"]
         self.mp_group = mp_group
-        #
-        #
-        #
+        # weight_type = 1 means original weight
+        # weight_type = 2 means weight has been transposed
+        # weight_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
         self.disable_fp16_opt = False
@@ -775,28 +775,14 @@ class FP16Linear(nn.Linear):

         x = x.to(torch.float16)
         if self.bias is not None and self.bias.dtype != x.dtype:
-
+            self.bias.data = self.bias.data.to(x.dtype)
         if self.weight is not None and self.weight.dtype != x.dtype:
             self.weight.data = self.weight.data.to(x.dtype)

         if not self.use_esimd_kernel(x):
-
-
-
-                    or self.disable_fp16_opt
-            ):
-                if self.weight_type == 2:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 1
-                result = F.linear(x, self.weight, self.bias)
-            else:
-                if self.weight_type == 1:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 2
-                result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
-                                                              self.weight, self.bias)
+            invalidInputError(self.weight_type == 1, "weight_type should be 1")
+            result = F.linear(x, self.weight, self.bias)
+
             if self.mp_group is not None:
                 if get_use_vllm():
                     result = self.mp_group.all_reduce(result)
@@ -848,11 +834,11 @@ class FP16Linear(nn.Linear):
             return result.to(x.dtype)

     def use_esimd_kernel(self, x):
-        gpu_type =
+        gpu_type = get_xpu_device_name(x.device)
         if self.disable_fp16_opt:
             return False
         # esimd kernel can only be used for Arc and Flex
-        if gpu_type not in ["arc"
+        if gpu_type not in ["arc"]:
             return False
         # now esimd kernel can only be used for specific cases (llama2-7b shape)
         if self.in_len == 11008 and self.out_features == 4096:
ipex_llm/transformers/model.py
CHANGED
@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
     self.to(origin_device)


-def _load_pre():
-    from transformers import GPTJModel
-    from ipex_llm.transformers.models.gptj import gptj_model_new_init
-    GPTJModel.__init__ = gptj_model_new_init
-
-
 class _BaseAutoModelClass:
     HF_MODEL = None

@@ -495,7 +489,6 @@ class _BaseAutoModelClass:
         else:
             if quant_config is not None:
                 kwargs["quantization_config"] = quant_config
-        _load_pre()
         try:
             # To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
             kwargs.pop('device_map', None)