ipex-llm 2.2.0b20250106__py3-none-win_amd64.whl → 2.2.0b20250106.post1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +19 -158
- ipex_llm/transformers/loader.py +1 -1
- ipex_llm/transformers/lookup.py +2 -2
- ipex_llm/transformers/low_bit_linear.py +15 -29
- ipex_llm/transformers/model.py +0 -7
- ipex_llm/transformers/models/chatglm2.py +1 -192
- ipex_llm/transformers/models/minicpmv.py +2 -2
- ipex_llm/transformers/models/sd.py +2 -2
- ipex_llm/transformers/models/utils.py +16 -104
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
- ipex_llm/transformers/speculative.py +2 -14
- ipex_llm/transformers/utils.py +7 -20
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/METADATA +40 -19
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/RECORD +49 -53
- ipex_llm/transformers/models/cohere.py +0 -589
- ipex_llm/transformers/models/falcon.py +0 -829
- ipex_llm/transformers/models/gptj.py +0 -441
- ipex_llm/transformers/models/mixtral.py +0 -576
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250106.post1.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250106.post1.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file

ipex_llm/libs/bloom.dll
CHANGED
Binary file

ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file

ipex_llm/libs/gptneox.dll
CHANGED
Binary file

ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file

ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file

ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file

ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file

ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file

ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file

ipex_llm/libs/llama-api.dll
CHANGED
Binary file

ipex_llm/libs/llama.dll
CHANGED
Binary file

ipex_llm/libs/main-bloom.exe
CHANGED
Binary file

ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file

ipex_llm/libs/main-llama.exe
CHANGED
Binary file

ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file

ipex_llm/libs/pipeline.dll
CHANGED
Binary file

ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file

ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file

ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file

ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file

ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file

ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file

ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file

ipex_llm/libs/starcoder.dll
CHANGED
Binary file
ipex_llm/transformers/convert.py
CHANGED
@@ -680,18 +680,9 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     optimize_lm_head=optimize_lm_head
                 )
                 device = module.weight.data.device
-
-                if get_ipex_version() < "2.1.10+xpu":
-                    new_linear._parameters['weight'] = nn.Parameter(module.weight)
-                else:
-                    # only from 2.1, ipex provides matmul_bias_out
-                    # so we need to transpose weight
-                    new_weight = module.weight.transpose(0, 1).contiguous()
-                    new_linear._parameters['weight'] = nn.Parameter(new_weight)
-                    new_linear.weight_type = 2
+                new_linear._parameters['weight'] = nn.Parameter(module.weight)
                 if module.bias is not None:
-                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
-                        .to(device)
+                    new_linear._parameters['bias'] = nn.Parameter(module.bias.data).to(device)
             elif qtype == ggml_tensor_qtype["bf16"]:
                 module.to(torch.bfloat16)
                 if _USE_VLLM:
@@ -1052,7 +1043,8 @@ def _optimize_pre(model, qtype=None):
         _optimize_pre(model.llm, qtype=qtype)
         model.llm.config.model_type = "megrezo"
     elif model.config.model_type == "chatglm":
-        if hasattr(model.config, 'padded_vocab_size') and
+        if hasattr(model.config, 'padded_vocab_size') and \
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2 and chatglm3
             from ipex_llm.transformers.models.chatglm2 import split_mlp
             model.apply(split_mlp)
@@ -1337,7 +1329,7 @@ def _optimize_post(model):
         and model.config.architectures[0] in ["ChatGLMModel", "ChatGLMForConditionalGeneration"]
     ):
         if hasattr(model.config, 'padded_vocab_size') and \
-                model.config.padded_vocab_size
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2-6b, chatglm2-6b-32k, chatglm3-6b, chatglm3-6b-32k, chatglm3-6b-128k
             modeling_module_name = model.__class__.__module__
             module = importlib.import_module(modeling_module_name)
@@ -1359,27 +1351,9 @@ def _optimize_post(model):
                             module.RMSNorm,
                             chatglm_rms_norm_forward)
             convert_forward(model, module.MLP, mlp_forward)
-
-
-
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            from ipex_llm.transformers.models.chatglm2 import codegeex_attention_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
-            from ipex_llm.transformers.models.chatglm2 import codegeex_model_forward
-            convert_forward(model,
-                            module.SelfAttention,
-                            codegeex_attention_forward)
-            convert_forward(model,
-                            module.GLMTransformer,
-                            chatglm2_encoder_forward)
-            convert_forward(model,
-                            module.ChatGLMModel,
-                            codegeex_model_forward)
-            convert_forward(model,
-                            module.RMSNorm,
-                            chatglm_rms_norm_forward)
+            # for codegeex-nano
+            if hasattr(model.config, "rope_ratio"):
+                model.transformer.rotary_pos_emb.rope_ratio = model.config.rope_ratio
         elif hasattr(model.config, 'vocab_size') and model.config.vocab_size == 130528:
             # chatglm-6b
             modeling_module_name = model.__class__.__module__
@@ -1469,21 +1443,6 @@ def _optimize_post(model):
                         module.MultiheadAttention,
                         mpt_multihead_attention_forward
                         )
-    elif "gptj" in model.config.model_type:
-        # dolly-v1-6b
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.gptj import gptj_attention_forward, gptj_model_forward,\
-            gptj_block_forward
-        convert_forward(model,
-                        module.GPTJAttention,
-                        gptj_attention_forward)
-        convert_forward(model,
-                        module.GPTJModel,
-                        gptj_model_forward)
-        convert_forward(model,
-                        module.GPTJBlock,
-                        gptj_block_forward)
     elif "bloom" in model.config.model_type:
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1492,44 +1451,6 @@ def _optimize_post(model):
                        module.BloomAttention,
                        bloom_attention_forward
                        )
-    elif "falcon" in model.config.model_type or "RefinedWeb" in model.config.model_type:
-        if model.config.architectures is not None:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            if "RWForCausalLM" in model.config.architectures:
-                if model.config.hidden_size == 4544:
-                    # falcon-7b need to check performance drop after kv cache support.
-                    # from ipex_llm.transformers.models.falcon import rw_attention_forward_7b
-                    # convert_forward(model,
-                    #                 module.Attention,
-                    #                 rw_attention_forward_7b
-                    #                 )
-                    pass
-                else:
-                    # falcon-40b
-                    from ipex_llm.transformers.models.falcon import rw_attention_forward_40b
-                    convert_forward(model,
-                                    module.Attention,
-                                    rw_attention_forward_40b
-                                    )
-            elif "FalconForCausalLM" in model.config.architectures:
-                if model.config.hidden_size != 4544:
-                    # falcon-180b and new falcon-40b
-                    if version.parse(trans_version) >= version.parse("4.36.0"):
-                        # transformers version >= 4.36.0
-                        from ipex_llm.transformers.models.falcon import \
-                            falcon_attention_forward_4_36
-
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward_4_36
-                                        )
-                    else:
-                        from ipex_llm.transformers.models.falcon import falcon_attention_forward
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward
-                                        )
     elif model.config.model_type == "baichuan":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1748,31 +1669,6 @@ def _optimize_post(model):
         convert_forward(model, module.VisionAttention, qwen2_vision_attention_forward)
         convert_forward(model, module.Qwen2VLModel, qwen2_vl_model_forward)
         convert_forward(model, module.Qwen2VLAttention, qwen2_vl_attention_forward)
-    elif model.config.model_type == "cohere":
-        # for CohereForAI/c4ai-command-r-v01
-        invalidInputError(version.parse(trans_version) >= version.parse("4.40.0"),
-                          "Please upgrade transformers to 4.40.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        if version.parse(trans_version) >= version.parse("4.41.0"):
-            from ipex_llm.transformers.models.cohere import cohere_model_forward_4_41
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward_4_41)
-        else:
-            from ipex_llm.transformers.models.cohere import cohere_model_forward
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward)
-
-        from ipex_llm.transformers.models.cohere import cohere_attention_forward
-        convert_forward(model,
-                        module.CohereAttention,
-                        cohere_attention_forward)
-        convert_forward(model,
-                        module.CohereMLP,
-                        mlp_silu_forward)
     elif model.config.model_type == "aquila":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1784,31 +1680,6 @@ def _optimize_post(model):
         convert_forward(model,
                         module.AquilaRMSNorm,
                         rms_norm_forward)
-    elif model.config.model_type == "mixtral":
-        # For mistralai/Mixtral-8x7B-v0.1
-        invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                          "Please upgrade transformers to 4.36.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.mixtral import mixtral_moeblock_forward, \
-            mixtral_attention_forward, mixtral_mlp_forward, mixtral_model_forward
-        convert_forward(model,
-                        module.MixtralAttention,
-                        mixtral_attention_forward)
-        convert_forward(model,
-                        module.MixtralRMSNorm,
-                        rms_norm_forward)
-        convert_forward(model,
-                        module.MixtralSparseMoeBlock,
-                        mixtral_moeblock_forward)
-        convert_forward(model,
-                        module.MixtralBLockSparseTop2MLP,
-                        mixtral_mlp_forward)
-        convert_forward(model,
-                        module.MixtralModel,
-                        mixtral_model_forward)
-
     elif model.config.model_type == "phi-msft" and \
             hasattr(model.config, "num_local_experts"):
         # For phixtral, limit the condition to avoid applying on phi-2 hosted by ModelScope
@@ -1823,29 +1694,19 @@ def _optimize_post(model):
                         module.MLP,
                         phixtral_mlp_forward)
     elif model.config.model_type == "mistral":
-
-
-        # For DiscoResearch/mixtral-7b-8expert
-        invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                          "Please upgrade transformers to 4.36.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
-        else:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)

-
-
-
-
+        from ipex_llm.transformers.models.mistral import mistral_model_forward
+        from ipex_llm.transformers.models.mistral import mistral_attention_forward
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.common import mlp_silu_forward

-
-
-
-
-
+        convert_forward(model, module.MistralModel, mistral_model_forward)
+        convert_forward(model, module.MistralAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralSdpaAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
+        convert_forward(model, module.MistralMLP, mlp_silu_forward)
     elif model.config.model_type == "gemma":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
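Everything in the convert.py hunks above funnels through one pattern: convert_forward(model, SomeModuleClass, optimized_forward) swaps the forward of every matching submodule for an IPEX-LLM implementation, and the post1 build removes whole branches of that dispatch (gptj, falcon, cohere, mixtral) while flattening the mistral branch into unconditional conversions. As a rough, hypothetical sketch of what a convert_forward-style helper does (an assumption for illustration, not the package's actual code):

    import types
    import torch.nn as nn

    def convert_forward_sketch(model: nn.Module, target_class: type, new_forward) -> None:
        # Rebind `forward` on every submodule whose class matches target_class,
        # so later calls go through the replacement implementation.
        for module in model.modules():
            if module.__class__ == target_class:
                module.forward = types.MethodType(new_forward, module)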
ipex_llm/transformers/loader.py
CHANGED
@@ -22,7 +22,7 @@ import time
 from datetime import date
 import argparse
 from ipex_llm.utils.common import invalidInputError
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, LlamaTokenizer

 LLAMA_IDS = ['llama', 'vicuna', 'merged-baize']

ipex_llm/transformers/lookup.py
CHANGED
@@ -33,7 +33,7 @@ from ipex_llm.transformers.speculative import greedy, deepmind_sample, logits_to
     _crop_past_key_values, _prepare_generate_args, _non_cpu_ipex_verify, clear_benchmarks,\
     _prepare_generate_args_4_45
 from ipex_llm.utils.common import invalidInputError
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import get_xpu_device_name

 logger = logging.getLogger("ipex_llm.lookup")

@@ -295,7 +295,7 @@ def lookup_generate(self,
     invalidInputError(input_ids.shape[0] == 1,
                       "Prompt lookup is currently not supported with batch inference.")

-    device_name =
+    device_name = get_xpu_device_name(input_ids.device)

     candidates_generator = PromptLookupCandidateGenerator(
         num_output_tokens=num_output_tokens,
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -51,7 +51,7 @@ from torch import Tensor, device, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype,
+from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name, \
     get_ipex_version
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

@@ -266,7 +266,7 @@ def reshape_lm_head_input(x):


 def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
-
+    device_name = get_xpu_device_name(x.device)
     batch_size = x.shape[0]
     hard_condition = (
         x.dtype in [torch.float, torch.half]
@@ -286,7 +286,7 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
         or (
             qtype in [SYM_INT8, FP4, FP6, Q4_K, Q6_K]
             and batch_size <= 48
-            and
+            and device_name in ["arc", "pvc", "mtl", "arl"]
             and x.shape[1] % 256 == 0
             and output_len % 32 == 0
         )
@@ -295,8 +295,8 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
     if hard_condition:
         return (
             batch_size > 1
-            or (device in ["arc"
-            or (device in ["arc", "
+            or (device in ["arc"] and qtype in [SYM_INT8, FP4])
+            or (device in ["arc", "mtl"] and qtype in [FP8E4])
             or (device in ["lnl"] and qtype in [SYM_INT4] and x.shape[1] % 512 == 0)
             or (device in ["bmg"] and qtype in [SYM_INT4, FP8E5])
         )
@@ -603,7 +603,7 @@ class LowBitLinear(nn.Linear):
         # empty cache before and after lm_head at first token when input > 1024
         # on arc or IPEX_LLM_LOW_MEM is set to 1 at inference time.
         if self.device is None:
-            self.device =
+            self.device = get_xpu_device_name(self.weight.data.device)
             self.low_memory_mode = \
                 self.low_memory_mode and \
                 (self.device == "arc" or os.environ.get("IPEX_LLM_LOW_MEM", None) == "1")
@@ -759,9 +759,9 @@ class FP16Linear(nn.Linear):
         self.weight_length = self.out_len * self.in_len
         self.qtype = ggml_tensor_qtype["fp16"]
         self.mp_group = mp_group
-        #
-        #
-        #
+        # weight_type = 1 means original weight
+        # weight_type = 2 means weight has been transposed
+        # weight_type = 3 means weight has been transposed by esimd method
         self.weight_type = 1
         self.optimize_lm_head = optimize_lm_head
         self.disable_fp16_opt = False
@@ -775,28 +775,14 @@ class FP16Linear(nn.Linear):

         x = x.to(torch.float16)
         if self.bias is not None and self.bias.dtype != x.dtype:
-
+            self.bias.data = self.bias.data.to(x.dtype)
         if self.weight is not None and self.weight.dtype != x.dtype:
             self.weight.data = self.weight.data.to(x.dtype)

         if not self.use_esimd_kernel(x):
-
-
-
-                    or self.disable_fp16_opt
-            ):
-                if self.weight_type == 2:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 1
-                result = F.linear(x, self.weight, self.bias)
-            else:
-                if self.weight_type == 1:
-                    self.weight = torch.nn.Parameter(self.weight.transpose(0, 1).contiguous(),
-                                                     requires_grad=False)
-                    self.weight_type = 2
-                result = torch.ops.torch_ipex.matmul_bias_out(x.contiguous(),
-                                                              self.weight, self.bias)
+            invalidInputError(self.weight_type == 1, "weight_type should be 1")
+            result = F.linear(x, self.weight, self.bias)
+
             if self.mp_group is not None:
                 if get_use_vllm():
                     result = self.mp_group.all_reduce(result)
@@ -848,11 +834,11 @@ class FP16Linear(nn.Linear):
             return result.to(x.dtype)

     def use_esimd_kernel(self, x):
-        gpu_type =
+        gpu_type = get_xpu_device_name(x.device)
         if self.disable_fp16_opt:
             return False
         # esimd kernel can only be used for Arc and Flex
-        if gpu_type not in ["arc"
+        if gpu_type not in ["arc"]:
             return False
         # now esimd kernel can only be used for specific cases (llama2-7b shape)
         if self.in_len == 11008 and self.out_features == 4096:
ipex_llm/transformers/model.py
CHANGED
@@ -103,12 +103,6 @@ def save_low_bit(self, *args, **kwargs):
     self.to(origin_device)


-def _load_pre():
-    from transformers import GPTJModel
-    from ipex_llm.transformers.models.gptj import gptj_model_new_init
-    GPTJModel.__init__ = gptj_model_new_init
-
-
 class _BaseAutoModelClass:
     HF_MODEL = None

@@ -495,7 +489,6 @@ class _BaseAutoModelClass:
         else:
             if quant_config is not None:
                 kwargs["quantization_config"] = quant_config
-        _load_pre()
         try:
             # To handle the input CUDA setting (such as 'device_map={"":0}'), ignore it
             kwargs.pop('device_map', None)