ipex-llm 2.2.0b20250106__py3-none-win_amd64.whl → 2.2.0b20250106.post1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +19 -158
  31. ipex_llm/transformers/loader.py +1 -1
  32. ipex_llm/transformers/lookup.py +2 -2
  33. ipex_llm/transformers/low_bit_linear.py +15 -29
  34. ipex_llm/transformers/model.py +0 -7
  35. ipex_llm/transformers/models/chatglm2.py +1 -192
  36. ipex_llm/transformers/models/minicpmv.py +2 -2
  37. ipex_llm/transformers/models/sd.py +2 -2
  38. ipex_llm/transformers/models/utils.py +16 -104
  39. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
  40. ipex_llm/transformers/speculative.py +2 -14
  41. ipex_llm/transformers/utils.py +7 -20
  42. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/METADATA +40 -19
  43. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/RECORD +49 -53
  44. ipex_llm/transformers/models/cohere.py +0 -589
  45. ipex_llm/transformers/models/falcon.py +0 -829
  46. ipex_llm/transformers/models/gptj.py +0 -441
  47. ipex_llm/transformers/models/mixtral.py +0 -576
  48. {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/ipex-llm-init.bat +0 -0
  49. {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-chat.ps1 +0 -0
  50. {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-cli.ps1 +0 -0
  51. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/WHEEL +0 -0
  52. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/entry_points.txt +0 -0
  53. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom.dll CHANGED (binary file, no textual diff)
ipex_llm/libs/gptneox.dll CHANGED (binary file, no textual diff)
ipex_llm/libs/llama.dll CHANGED (binary file, no textual diff)
The remaining ipex_llm/libs/*.dll and *.exe binaries listed above also changed; binary files carry no textual diff.
ipex_llm/transformers/convert.py CHANGED
@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     optimize_lm_head=optimize_lm_head
                 )
                 device = module.weight.data.device
-                from ipex_llm.transformers.utils import get_ipex_version
-                if get_ipex_version() < "2.1.10+xpu":
-                    new_linear._parameters['weight'] = nn.Parameter(module.weight)
-                else:
-                    # only from 2.1, ipex provides matmul_bias_out
-                    # so we need to transpose weight
-                    new_weight = module.weight.transpose(0, 1).contiguous()
-                    new_linear._parameters['weight'] = nn.Parameter(new_weight)
-                    new_linear.weight_type = 2
+                new_linear._parameters['weight'] = nn.Parameter(module.weight)
                 if module.bias is not None:
-                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
-                        .to(device)
+                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
             elif qtype == ggml_tensor_qtype["bf16"]:
                 module.to(torch.bfloat16)
                 if _USE_VLLM:
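For reference, a minimal standalone sketch of the simplified wiring above: the original (out_features, in_features) weight is reused as-is instead of being transposed for the matmul_bias_out path, and the bias is wrapped and moved to the weight's device. FakeFP16Linear and src below are stand-ins for illustration only, not the real ipex-llm FP16Linear class.

import torch
from torch import nn

# Stand-in class for the demo; the real FP16Linear lives in ipex_llm.
class FakeFP16Linear(nn.Linear):
    pass

src = nn.Linear(8, 4)                       # pretend this is the original module
new_linear = FakeFP16Linear(8, 4, bias=True)
device = src.weight.data.device

# keep the weight in its original (out_features, in_features) layout, no transpose
new_linear._parameters['weight'] = nn.Parameter(src.weight)
if src.bias is not None:
    new_linear._parameters['bias'] = nn.Parameter(src.bias.data).to(device)

print(new_linear.weight.shape)              # torch.Size([4, 8])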
@@ -1052,7 +1043,8 @@ def _optimize_pre(model, qtype=None):
         _optimize_pre(model.llm, qtype=qtype)
         model.llm.config.model_type = "megrezo"
     elif model.config.model_type == "chatglm":
-        if hasattr(model.config, 'padded_vocab_size') and model.config.padded_vocab_size == 65024:
+        if hasattr(model.config, 'padded_vocab_size') and \
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2 and chatglm3
             from ipex_llm.transformers.models.chatglm2 import split_mlp
             model.apply(split_mlp)
@@ -1337,7 +1329,7 @@ def _optimize_post(model):
         and model.config.architectures[0] in ["ChatGLMModel", "ChatGLMForConditionalGeneration"]
     ):
         if hasattr(model.config, 'padded_vocab_size') and \
-                model.config.padded_vocab_size == 65024:
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2-6b, chatglm2-6b-32k, chatglm3-6b, chatglm3-6b-32k, chatglm3-6b-128k
             modeling_module_name = model.__class__.__module__
             module = importlib.import_module(modeling_module_name)
@@ -1359,27 +1351,9 @@ def _optimize_post(model):
                             module.RMSNorm,
                             chatglm_rms_norm_forward)
             convert_forward(model, module.MLP, mlp_forward)
-        elif hasattr(model.config, 'padded_vocab_size') and \
-                model.config.padded_vocab_size == 64896:
-            # codegeex-nano
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            from ipex_llm.transformers.models.chatglm2 import codegeex_attention_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
-            from ipex_llm.transformers.models.chatglm2 import codegeex_model_forward
-            convert_forward(model,
-                            module.SelfAttention,
-                            codegeex_attention_forward)
-            convert_forward(model,
-                            module.GLMTransformer,
-                            chatglm2_encoder_forward)
-            convert_forward(model,
-                            module.ChatGLMModel,
-                            codegeex_model_forward)
-            convert_forward(model,
-                            module.RMSNorm,
-                            chatglm_rms_norm_forward)
+            # for codegeex-nano
+            if hasattr(model.config, "rope_ratio"):
+                model.transformer.rotary_pos_emb.rope_ratio = model.config.rope_ratio
         elif hasattr(model.config, 'vocab_size') and model.config.vocab_size == 130528:
             # chatglm-6b
             modeling_module_name = model.__class__.__module__
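The dedicated codegeex forwards are dropped above; codegeex-nano now flows through the shared chatglm2/3 path and only its rope_ratio is copied onto the rotary embedding module. A toy sketch of that propagation, using SimpleNamespace stand-ins and placeholder values rather than the real ChatGLM config and modules:

from types import SimpleNamespace

# Placeholder config/module objects for the demo; values are illustrative only.
config = SimpleNamespace(padded_vocab_size=64896, rope_ratio=100)
rotary_pos_emb = SimpleNamespace(rope_ratio=1)
model = SimpleNamespace(config=config,
                        transformer=SimpleNamespace(rotary_pos_emb=rotary_pos_emb))

# Mirror of the new logic: copy rope_ratio from the config if it is present.
if hasattr(model.config, "rope_ratio"):
    model.transformer.rotary_pos_emb.rope_ratio = model.config.rope_ratio

print(model.transformer.rotary_pos_emb.rope_ratio)   # 100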
@@ -1469,21 +1443,6 @@ def _optimize_post(model):
                         module.MultiheadAttention,
                         mpt_multihead_attention_forward
                         )
-    elif "gptj" in model.config.model_type:
-        # dolly-v1-6b
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
-            gptj_block_forward
-        convert_forward(model,
-                        module.GPTJAttention,
-                        gptj_attention_forward)
-        convert_forward(model,
-                        module.GPTJModel,
-                        gptj_model_forward)
-        convert_forward(model,
-                        module.GPTJBlock,
-                        gptj_block_forward)
     elif "bloom" in model.config.model_type:
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1492,44 +1451,6 @@ def _optimize_post(model):
                         module.BloomAttention,
                         bloom_attention_forward
                         )
-    elif "falcon" in model.config.model_type or "RefinedWeb" in model.config.model_type:
-        if model.config.architectures is not None:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            if "RWForCausalLM" in model.config.architectures:
-                if model.config.hidden_size == 4544:
-                    # falcon-7b need to check performance drop after kv cache support.
-                    # from ipex_llm.transformers.models.falcon import rw_attention_forward_7b
-                    # convert_forward(model,
-                    #                 module.Attention,
-                    #                 rw_attention_forward_7b
-                    #                 )
-                    pass
-                else:
-                    # falcon-40b
-                    from ipex_llm.transformers.models.falcon import rw_attention_forward_40b
-                    convert_forward(model,
-                                    module.Attention,
-                                    rw_attention_forward_40b
-                                    )
-            elif "FalconForCausalLM" in model.config.architectures:
-                if model.config.hidden_size != 4544:
-                    # falcon-180b and new falcon-40b
-                    if version.parse(trans_version) >= version.parse("4.36.0"):
-                        # transformers version >= 4.36.0
-                        from ipex_llm.transformers.models.falcon import \
-                            falcon_attention_forward_4_36
-
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward_4_36
-                                        )
-                    else:
-                        from ipex_llm.transformers.models.falcon import falcon_attention_forward
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward
-                                        )
     elif model.config.model_type == "baichuan":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1748,31 +1669,6 @@ def _optimize_post(model):
         convert_forward(model, module.VisionAttention, qwen2_vision_attention_forward)
         convert_forward(model, module.Qwen2VLModel, qwen2_vl_model_forward)
         convert_forward(model, module.Qwen2VLAttention, qwen2_vl_attention_forward)
-    elif model.config.model_type == "cohere":
-        # for CohereForAI/c4ai-command-r-v01
-        invalidInputError(version.parse(trans_version) >= version.parse("4.40.0"),
-                          "Please upgrade transformers to 4.40.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        if version.parse(trans_version) >= version.parse("4.41.0"):
-            from ipex_llm.transformers.models.cohere import cohere_model_forward_4_41
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward_4_41)
-        else:
-            from ipex_llm.transformers.models.cohere import cohere_model_forward
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward)
-
-        from ipex_llm.transformers.models.cohere import cohere_attention_forward
-        convert_forward(model,
-                        module.CohereAttention,
-                        cohere_attention_forward)
-        convert_forward(model,
-                        module.CohereMLP,
-                        mlp_silu_forward)
     elif model.config.model_type == "aquila":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1784,31 +1680,6 @@ def _optimize_post(model):
         convert_forward(model,
                         module.AquilaRMSNorm,
                         rms_norm_forward)
-    elif model.config.model_type == "mixtral":
-        # For mistralai/Mixtral-8x7B-v0.1
-        invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                          "Please upgrade transformers to 4.36.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.mixtral import mixtral_moeblock_forward, \
-            mixtral_attention_forward, mixtral_mlp_forward, mixtral_model_forward
-        convert_forward(model,
-                        module.MixtralAttention,
-                        mixtral_attention_forward)
-        convert_forward(model,
-                        module.MixtralRMSNorm,
-                        rms_norm_forward)
-        convert_forward(model,
-                        module.MixtralSparseMoeBlock,
-                        mixtral_moeblock_forward)
-        convert_forward(model,
-                        module.MixtralBLockSparseTop2MLP,
-                        mixtral_mlp_forward)
-        convert_forward(model,
-                        module.MixtralModel,
-                        mixtral_model_forward)
-
     elif model.config.model_type == "phi-msft" and \
             hasattr(model.config, "num_local_experts"):
         # For phixtral, limit the condition to avoid applying on phi-2 hosted by ModelScope
@@ -1823,29 +1694,19 @@ def _optimize_post(model):
                         module.MLP,
                         phixtral_mlp_forward)
     elif model.config.model_type == "mistral":
-        if model.config.architectures is not None and \
-                model.config.architectures[0] == "MixtralForCausalLM":
-            # For DiscoResearch/mixtral-7b-8expert
-            invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                              "Please upgrade transformers to 4.36.0 or higher version "
-                              "to run Mixtral models.")
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
-        else:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)

-            from ipex_llm.transformers.models.mistral import mistral_model_forward
-            from ipex_llm.transformers.models.mistral import mistral_attention_forward
-            from ipex_llm.transformers.models.common import rms_norm_forward
-            from ipex_llm.transformers.models.common import mlp_silu_forward
+        from ipex_llm.transformers.models.mistral import mistral_model_forward
+        from ipex_llm.transformers.models.mistral import mistral_attention_forward
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.common import mlp_silu_forward

-            convert_forward(model, module.MistralModel, mistral_model_forward)
-            convert_forward(model, module.MistralAttention, mistral_attention_forward)
-            convert_forward(model, module.MistralSdpaAttention, mistral_attention_forward)
-            convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
-            convert_forward(model, module.MistralMLP, mlp_silu_forward)
+        convert_forward(model, module.MistralModel, mistral_model_forward)
+        convert_forward(model, module.MistralAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralSdpaAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
+        convert_forward(model, module.MistralMLP, mlp_silu_forward)
     elif model.config.model_type == "gemma":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
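The mistral branch now patches the module classes unconditionally via convert_forward. As a rough illustration of the underlying monkey-patching pattern (a minimal sketch only, not ipex-llm's actual convert_forward implementation), the replacement function is bound onto every sub-module of the target class:

import torch
from torch import nn

def convert_forward_sketch(model: nn.Module, target_cls, new_forward):
    # Rebind the forward method of every sub-module of target_cls.
    for module in model.modules():
        if module.__class__ == target_cls:
            module.forward = new_forward.__get__(module, module.__class__)

def fused_relu_forward(self, x):
    # stand-in "optimized" forward for the demo
    return torch.relu(self.linear(x))

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
    def forward(self, x):
        return self.linear(x)

model = nn.Sequential(ToyBlock(), ToyBlock())
convert_forward_sketch(model, ToyBlock, fused_relu_forward)
print(model(torch.randn(2, 4)).shape)   # torch.Size([2, 4])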
ipex_llm/transformers/loader.py CHANGED
@@ -22,7 +22,7 @@ import time
 from datetime import date
 import argparse
 from ipex_llm.utils.common import invalidInputError
-from transformers import AutoTokenizer, GPTJForCausalLM, LlamaTokenizer
+from transformers import AutoTokenizer, LlamaTokenizer

 LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

ipex_llm/transformers/lookup.py CHANGED
@@ -33,7 +33,7 @@ from ipex_llm.transformers.speculative import greedy, deepmind_sample, logits_to
     _crop_past_key_values, _prepare_generate_args, _non_cpu_ipex_verify, clear_benchmarks,\
     _prepare_generate_args_4_45
 from ipex_llm.utils.common import invalidInputError
-from ipex_llm.transformers.utils import get_xpu_device_type
+from ipex_llm.transformers.utils import get_xpu_device_name

 logger = logging.getLogger("ipex_llm.lookup")

@@ -295,7 +295,7 @@ def lookup_generate(self,
     invalidInputError(input_ids.shape[0] == 1,
                       "Prompt lookup is currently not supported with batch inference.")

-    device_name = get_xpu_device_type(input_ids)
+    device_name = get_xpu_device_name(input_ids.device)

     candidates_generator = PromptLookupCandidateGenerator(
         num_output_tokens=num_output_tokens,
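Note the changed call pattern: get_xpu_device_name takes a torch.device rather than a tensor and returns a short device-family string such as "arc", "pvc", "mtl", "arl", "lnl" or "bmg". A hypothetical stand-in (not the real ipex-llm helper) that shows the intended usage:

import torch

def get_xpu_device_name_sketch(device: torch.device) -> str:
    # Illustrative mapping only; the real helper lives in ipex_llm.transformers.utils.
    if device.type != "xpu":
        return device.type                      # e.g. "cpu"
    name = torch.xpu.get_device_name(device.index or 0).lower()
    if "arc" in name:
        return "arc"
    return "other"

x = torch.randn(2, 4)                           # lives on CPU in this sketch
print(get_xpu_device_name_sketch(x.device))     # "cpu"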
ipex_llm/transformers/low_bit_linear.py CHANGED
@@ -51,7 +51,7 @@ from torch import Tensor, device, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_type, \
+from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name, \
     get_ipex_version
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

@@ -266,7 +266,7 @@ def reshape_lm_head_input(x):


 def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
-    device = get_xpu_device_type(x)
+    device_name = get_xpu_device_name(x.device)
     batch_size = x.shape[0]
     hard_condition = (
         x.dtype in [torch.float, torch.half]
@@ -286,7 +286,7 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
         or (
             qtype in [SYM_INT8, FP4, FP6, Q4_K, Q6_K]
             and batch_size <= 48
-            and device in ["arc", "flex", "pvc", "mtl"]
+            and device_name in ["arc", "pvc", "mtl", "arl"]
             and x.shape[1] % 256 == 0
             and output_len % 32 == 0
         )
@@ -295,8 +295,8 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
     if hard_condition:
         return (
             batch_size > 1
-            or (device in ["arc", "flex"] and qtype in [SYM_INT8, FP4])
-            or (device in ["arc", "flex", "mtl"] and qtype in [FP8E4])
+            or (device in ["arc"] and qtype in [SYM_INT8, FP4])
+            or (device in ["arc", "mtl"] and qtype in [FP8E4])
             or (device in ["lnl"] and qtype in [SYM_INT4] and x.shape[1] % 512 == 0)
             or (device in ["bmg"] and qtype in [SYM_INT4, FP8E5])
         )
@@ -603,7 +603,7 @@ class LowBitLinear(nn.Linear):
         # empty cache before and after lm_head at first token when input > 1024
         # on arc or IPEX_LLM_LOW_MEM is set to 1 at inference time.
         if self.device is None:
-            self.device = get_xpu_device_type(self.weight.data)
+            self.device = get_xpu_device_name(self.weight.data.device)
             self.low_memory_mode = \
                 self.low_memory_mode and \
                 (self.device == "arc" or os.environ.get("IPEX_LLM_LOW_MEM", None) == "1")
@@ -759,9 +759,9 @@ class FP16Linear(nn.Linear):
         self.weight_length = self.out_len * self.in_len
         self.qtype = ggml_tensor_qtype["fp16"]
         self.mp_group = mp_group
-        # weigh_type = 1 means original weight
-        # weigh_type = 2 means weight has been transposed
-        # weigh_type = 3 means weight has been transposed by esimd method
+        # weight_type = 1 means original weight
+        # weight_type = 2 means weight has been transposed
+        # weight_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
         self.disable_fp16_opt = False
@@ -775,28 +775,14 @@ class FP16Linear(nn.Linear):

         x = x.to(torch.float16)
         if self.bias is not None and self.bias.dtype != x.dtype:
-                self.bias.data = self.bias.data.to(x.dtype)
+            self.bias.data = self.bias.data.to(x.dtype)
         if self.weight is not None and self.weight.dtype != x.dtype:
             self.weight.data = self.weight.data.to(x.dtype)

         if not self.use_esimd_kernel(x):
-            if (
-                get_ipex_version() < "2.1.10+xpu"
-                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
-                or self.disable_fp16_opt
-            ):
-                if self.weight_type == 2:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 1
-                result = F.linear(x, self.weight, self.bias)
-            else:
-                if self.weight_type == 1:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 2
-                result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
-                                                              self.weight, self.bias)
+            invalidInputError(self.weight_type == 1, "weight_type should be 1")
+            result = F.linear(x, self.weight, self.bias)
+
             if self.mp_group is not None:
                 if get_use_vllm():
                     result = self.mp_group.all_reduce(result)
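A minimal standalone sketch of the simplified forward path above, assuming the weight stays in the standard nn.Linear layout (weight_type == 1): the computation reduces to a plain F.linear. The demo falls back to float32 on CPU so it runs anywhere; on an XPU build the real path uses float16.

import torch
import torch.nn.functional as F

# Pick float16 only when an XPU is actually available, so the sketch stays runnable on CPU.
device = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cpu"
dtype = torch.float16 if device == "xpu" else torch.float32

x = torch.randn(1, 8, 4096, device=device, dtype=dtype)
weight = torch.randn(11008, 4096, device=device, dtype=dtype)   # (out_features, in_features)
bias = torch.randn(11008, device=device, dtype=dtype)

result = F.linear(x, weight, bias)
print(result.shape)   # torch.Size([1, 8, 11008])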
@@ -848,11 +834,11 @@ class FP16Linear(nn.Linear):
             return result.to(x.dtype)

     def use_esimd_kernel(self, x):
-        gpu_type = get_xpu_device_type(x)
+        gpu_type = get_xpu_device_name(x.device)
         if self.disable_fp16_opt:
             return False
         # esimd kernel can only be used for Arc and Flex
-        if gpu_type not in ["arc", "flex"]:
+        if gpu_type not in ["arc"]:
             return False
         # now esimd kernel can only be used for specific cases (llama2-7b shape)
         if self.in_len == 11008 and self.out_features == 4096:
ipex_llm/transformers/model.py CHANGED
@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
     self.to(origin_device)


-def _load_pre():
-    from transformers import GPTJModel
-    from ipex_llm.transformers.models.gptj import gptj_model_new_init
-    GPTJModel.__init__ = gptj_model_new_init
-
-
 class _BaseAutoModelClass:
     HF_MODEL = None

@@ -495,7 +489,6 @@ class _BaseAutoModelClass:
         else:
             if quant_config is not None:
                 kwargs["quantization_config"] = quant_config
-            _load_pre()
             try:
                 # To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
                 kwargs.pop('device_map', None)