ipex-llm 2.2.0b20250106__py3-none-win_amd64.whl → 2.2.0b20250107__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +17 -132
  31. ipex_llm/transformers/lookup.py +2 -2
  32. ipex_llm/transformers/low_bit_linear.py +8 -8
  33. ipex_llm/transformers/models/chatglm2.py +1 -192
  34. ipex_llm/transformers/models/minicpmv.py +2 -2
  35. ipex_llm/transformers/models/sd.py +2 -2
  36. ipex_llm/transformers/models/utils.py +14 -89
  37. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
  38. ipex_llm/transformers/utils.py +5 -20
  39. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250107.dist-info}/METADATA +40 -19
  40. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250107.dist-info}/RECORD +46 -49
  41. ipex_llm/transformers/models/cohere.py +0 -589
  42. ipex_llm/transformers/models/falcon.py +0 -829
  43. ipex_llm/transformers/models/mixtral.py +0 -576
  44. {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250107.data}/scripts/ipex-llm-init.bat +0 -0
  45. {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250107.data}/scripts/llm-chat.ps1 +0 -0
  46. {ipex_llm-2.2.0b20250106.data → ipex_llm-2.2.0b20250107.data}/scripts/llm-cli.ps1 +0 -0
  47. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250107.dist-info}/WHEEL +0 -0
  48. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250107.dist-info}/entry_points.txt +0 -0
  49. {ipex_llm-2.2.0b20250106.dist-info → ipex_llm-2.2.0b20250107.dist-info}/top_level.txt +0 -0
ipex_llm/libs/ binaries CHANGED
Binary files (no textual diff shown): all 29 .dll and .exe files under ipex_llm/libs/ listed above changed between the two wheels, including ipex_llm/libs/bloom.dll, ipex_llm/libs/gptneox.dll, and ipex_llm/libs/llama.dll.
ipex_llm/transformers/convert.py CHANGED
@@ -1052,7 +1052,8 @@ def _optimize_pre(model, qtype=None):
         _optimize_pre(model.llm, qtype=qtype)
         model.llm.config.model_type = "megrezo"
     elif model.config.model_type == "chatglm":
-        if hasattr(model.config, 'padded_vocab_size') and model.config.padded_vocab_size == 65024:
+        if hasattr(model.config, 'padded_vocab_size') and \
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2 and chatglm3
             from ipex_llm.transformers.models.chatglm2 import split_mlp
             model.apply(split_mlp)
@@ -1337,7 +1338,7 @@ def _optimize_post(model):
         and model.config.architectures[0] in ["ChatGLMModel", "ChatGLMForConditionalGeneration"]
     ):
         if hasattr(model.config, 'padded_vocab_size') and \
-                model.config.padded_vocab_size == 65024:
+                model.config.padded_vocab_size in [65024, 64896]:
             # chatglm2-6b, chatglm2-6b-32k, chatglm3-6b, chatglm3-6b-32k, chatglm3-6b-128k
             modeling_module_name = model.__class__.__module__
             module = importlib.import_module(modeling_module_name)
@@ -1359,27 +1360,9 @@ def _optimize_post(model):
                             module.RMSNorm,
                             chatglm_rms_norm_forward)
             convert_forward(model, module.MLP, mlp_forward)
-        elif hasattr(model.config, 'padded_vocab_size') and \
-                model.config.padded_vocab_size == 64896:
-            # codegeex-nano
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            from ipex_llm.transformers.models.chatglm2 import codegeex_attention_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
-            from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
-            from ipex_llm.transformers.models.chatglm2 import codegeex_model_forward
-            convert_forward(model,
-                            module.SelfAttention,
-                            codegeex_attention_forward)
-            convert_forward(model,
-                            module.GLMTransformer,
-                            chatglm2_encoder_forward)
-            convert_forward(model,
-                            module.ChatGLMModel,
-                            codegeex_model_forward)
-            convert_forward(model,
-                            module.RMSNorm,
-                            chatglm_rms_norm_forward)
+            # for codegeex-nano
+            if hasattr(model.config, "rope_ratio"):
+                model.transformer.rotary_pos_emb.rope_ratio = model.config.rope_ratio
         elif hasattr(model.config, 'vocab_size') and model.config.vocab_size == 130528:
             # chatglm-6b
             modeling_module_name = model.__class__.__module__
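Note: with this release, codegeex-nano (padded_vocab_size 64896) flows through the same chatglm2/chatglm3 branch instead of its own set of forward replacements; the only codegeex-specific step left is copying rope_ratio from the config onto the rotary embedding. Below is a minimal, self-contained sketch of that dispatch. The config objects are stand-ins (types.SimpleNamespace), not the real transformers/ipex-llm classes, and the rope_ratio value is purely illustrative.

    from types import SimpleNamespace

    def pick_chatglm_branch(config) -> str:
        # chatglm2/chatglm3 and codegeex-nano now share one optimization path
        if getattr(config, "padded_vocab_size", None) in [65024, 64896]:
            return "chatglm2-family"
        if getattr(config, "vocab_size", None) == 130528:
            return "chatglm-6b"
        return "unoptimized"

    def apply_rope_ratio(config, rotary_pos_emb) -> None:
        # codegeex-nano only differs by its RoPE scaling factor, which is now
        # copied onto the rotary embedding instead of using dedicated forwards
        if hasattr(config, "rope_ratio"):
            rotary_pos_emb.rope_ratio = config.rope_ratio

    codegeex_cfg = SimpleNamespace(padded_vocab_size=64896, rope_ratio=500)  # illustrative values
    rope = SimpleNamespace(rope_ratio=1)
    print(pick_chatglm_branch(codegeex_cfg))   # chatglm2-family
    apply_rope_ratio(codegeex_cfg, rope)
    print(rope.rope_ratio)                     # 500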
@@ -1492,44 +1475,6 @@ def _optimize_post(model):
                         module.BloomAttention,
                         bloom_attention_forward
                         )
-    elif "falcon" in model.config.model_type or "RefinedWeb" in model.config.model_type:
-        if model.config.architectures is not None:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            if "RWForCausalLM" in model.config.architectures:
-                if model.config.hidden_size == 4544:
-                    # falcon-7b need to check performance drop after kv cache support.
-                    # from ipex_llm.transformers.models.falcon import rw_attention_forward_7b
-                    # convert_forward(model,
-                    #                 module.Attention,
-                    #                 rw_attention_forward_7b
-                    #                 )
-                    pass
-                else:
-                    # falcon-40b
-                    from ipex_llm.transformers.models.falcon import rw_attention_forward_40b
-                    convert_forward(model,
-                                    module.Attention,
-                                    rw_attention_forward_40b
-                                    )
-            elif "FalconForCausalLM" in model.config.architectures:
-                if model.config.hidden_size != 4544:
-                    # falcon-180b and new falcon-40b
-                    if version.parse(trans_version) >= version.parse("4.36.0"):
-                        # transformers version >= 4.36.0
-                        from ipex_llm.transformers.models.falcon import \
-                            falcon_attention_forward_4_36
-
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward_4_36
-                                        )
-                    else:
-                        from ipex_llm.transformers.models.falcon import falcon_attention_forward
-                        convert_forward(model,
-                                        module.FalconAttention,
-                                        falcon_attention_forward
-                                        )
     elif model.config.model_type == "baichuan":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1748,31 +1693,6 @@ def _optimize_post(model):
         convert_forward(model, module.VisionAttention, qwen2_vision_attention_forward)
         convert_forward(model, module.Qwen2VLModel, qwen2_vl_model_forward)
         convert_forward(model, module.Qwen2VLAttention, qwen2_vl_attention_forward)
-    elif model.config.model_type == "cohere":
-        # for CohereForAI/c4ai-command-r-v01
-        invalidInputError(version.parse(trans_version) >= version.parse("4.40.0"),
-                          "Please upgrade transformers to 4.40.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        if version.parse(trans_version) >= version.parse("4.41.0"):
-            from ipex_llm.transformers.models.cohere import cohere_model_forward_4_41
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward_4_41)
-        else:
-            from ipex_llm.transformers.models.cohere import cohere_model_forward
-            convert_forward(model,
-                            module.CohereModel,
-                            cohere_model_forward)
-
-        from ipex_llm.transformers.models.cohere import cohere_attention_forward
-        convert_forward(model,
-                        module.CohereAttention,
-                        cohere_attention_forward)
-        convert_forward(model,
-                        module.CohereMLP,
-                        mlp_silu_forward)
     elif model.config.model_type == "aquila":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1784,31 +1704,6 @@ def _optimize_post(model):
         convert_forward(model,
                         module.AquilaRMSNorm,
                         rms_norm_forward)
-    elif model.config.model_type == "mixtral":
-        # For mistralai/Mixtral-8x7B-v0.1
-        invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                          "Please upgrade transformers to 4.36.0 or higher version "
-                          "to run Mixtral models.")
-        modeling_module_name = model.__class__.__module__
-        module = importlib.import_module(modeling_module_name)
-        from ipex_llm.transformers.models.mixtral import mixtral_moeblock_forward, \
-            mixtral_attention_forward, mixtral_mlp_forward, mixtral_model_forward
-        convert_forward(model,
-                        module.MixtralAttention,
-                        mixtral_attention_forward)
-        convert_forward(model,
-                        module.MixtralRMSNorm,
-                        rms_norm_forward)
-        convert_forward(model,
-                        module.MixtralSparseMoeBlock,
-                        mixtral_moeblock_forward)
-        convert_forward(model,
-                        module.MixtralBLockSparseTop2MLP,
-                        mixtral_mlp_forward)
-        convert_forward(model,
-                        module.MixtralModel,
-                        mixtral_model_forward)
-
     elif model.config.model_type == "phi-msft" and \
             hasattr(model.config, "num_local_experts"):
         # For phixtral, limit the condition to avoid applying on phi-2 hosted by ModelScope
@@ -1823,29 +1718,19 @@ def _optimize_post(model):
                         module.MLP,
                         phixtral_mlp_forward)
     elif model.config.model_type == "mistral":
-        if model.config.architectures is not None and \
-                model.config.architectures[0] == "MixtralForCausalLM":
-            # For DiscoResearch/mixtral-7b-8expert
-            invalidInputError(version.parse(trans_version) >= version.parse("4.36.0"),
-                              "Please upgrade transformers to 4.36.0 or higher version "
-                              "to run Mixtral models.")
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
-        else:
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)

-            from ipex_llm.transformers.models.mistral import mistral_model_forward
-            from ipex_llm.transformers.models.mistral import mistral_attention_forward
-            from ipex_llm.transformers.models.common import rms_norm_forward
-            from ipex_llm.transformers.models.common import mlp_silu_forward
+        from ipex_llm.transformers.models.mistral import mistral_model_forward
+        from ipex_llm.transformers.models.mistral import mistral_attention_forward
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.common import mlp_silu_forward

-            convert_forward(model, module.MistralModel, mistral_model_forward)
-            convert_forward(model, module.MistralAttention, mistral_attention_forward)
-            convert_forward(model, module.MistralSdpaAttention, mistral_attention_forward)
-            convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
-            convert_forward(model, module.MistralMLP, mlp_silu_forward)
+        convert_forward(model, module.MistralModel, mistral_model_forward)
+        convert_forward(model, module.MistralAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralSdpaAttention, mistral_attention_forward)
+        convert_forward(model, module.MistralRMSNorm, rms_norm_forward)
+        convert_forward(model, module.MistralMLP, mlp_silu_forward)
     elif model.config.model_type == "gemma":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
ipex_llm/transformers/lookup.py CHANGED
@@ -33,7 +33,7 @@ from ipex_llm.transformers.speculative import greedy, deepmind_sample, logits_to
     _crop_past_key_values, _prepare_generate_args, _non_cpu_ipex_verify, clear_benchmarks,\
     _prepare_generate_args_4_45
 from ipex_llm.utils.common import invalidInputError
-from ipex_llm.transformers.utils import get_xpu_device_type
+from ipex_llm.transformers.utils import get_xpu_device_name

 logger = logging.getLogger("ipex_llm.lookup")

@@ -295,7 +295,7 @@ def lookup_generate(self,
     invalidInputError(input_ids.shape[0] == 1,
                       "Prompt lookup is currently not supported with batch inference.")

-    device_name = get_xpu_device_type(input_ids)
+    device_name = get_xpu_device_name(input_ids.device)

     candidates_generator = PromptLookupCandidateGenerator(
         num_output_tokens=num_output_tokens,
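Note: get_xpu_device_type(tensor) is replaced by get_xpu_device_name(device) throughout this release, so call sites now pass tensor.device instead of the tensor itself. The helper below is a hypothetical stand-in written only to illustrate the new torch.device-based signature; it is not the ipex-llm implementation, and the substring matching is a guess. It assumes an XPU-enabled PyTorch build for the torch.xpu.get_device_name call.

    import torch

    def get_xpu_device_name_sketch(device: torch.device) -> str:
        # Hypothetical stand-in, not ipex-llm's real helper.
        if device.type != "xpu":
            return device.type                       # e.g. "cpu"
        raw = torch.xpu.get_device_name(device.index or 0).lower()
        for name in ("arc", "pvc", "mtl", "lnl", "arl", "bmg"):
            if name in raw:
                return name
        return "others"

    # Call sites in this release change from passing a tensor to passing its device:
    #   old: device_name = get_xpu_device_type(input_ids)
    #   new: device_name = get_xpu_device_name(input_ids.device)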
ipex_llm/transformers/low_bit_linear.py CHANGED
@@ -51,7 +51,7 @@ from torch import Tensor, device, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_type, \
+from ipex_llm.transformers.utils import get_autocast_dtype, get_xpu_device_name, \
     get_ipex_version
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

@@ -266,7 +266,7 @@ def reshape_lm_head_input(x):


 def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
-    device = get_xpu_device_type(x)
+    device_name = get_xpu_device_name(x.device)
     batch_size = x.shape[0]
     hard_condition = (
         x.dtype in [torch.float, torch.half]
@@ -286,7 +286,7 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
         or (
             qtype in [SYM_INT8, FP4, FP6, Q4_K, Q6_K]
             and batch_size <= 48
-            and device in ["arc", "flex", "pvc", "mtl"]
+            and device_name in ["arc", "pvc", "mtl", "lnl", "arl"]
             and x.shape[1] % 256 == 0
             and output_len % 32 == 0
         )
@@ -295,8 +295,8 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int):
     if hard_condition:
         return (
             batch_size > 1
-            or (device in ["arc", "flex"] and qtype in [SYM_INT8, FP4])
-            or (device in ["arc", "flex", "mtl"] and qtype in [FP8E4])
+            or (device in ["arc"] and qtype in [SYM_INT8, FP4])
+            or (device in ["arc", "mtl"] and qtype in [FP8E4])
             or (device in ["lnl"] and qtype in [SYM_INT4] and x.shape[1] % 512 == 0)
             or (device in ["bmg"] and qtype in [SYM_INT4, FP8E5])
         )
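Note: the batch-forward gate is now keyed on the renamed device-name helper, with "flex" dropped from the recognized names and "lnl"/"arl" added. The function below restates only the return expression from the hunk above in isolation, to make the device/qtype combinations easier to read; the qtype constants are placeholders rather than ipex-llm's real enum values, and the hard_condition checks that precede it in low_bit_linear.py are not reproduced here.

    # Placeholder qtype constants, for illustration only.
    SYM_INT4, SYM_INT8, FP4, FP8E4, FP8E5 = 0, 1, 2, 3, 4

    def batch_forward_final_gate(device_name: str, qtype: int,
                                 batch_size: int, in_features: int) -> bool:
        # Mirrors only the return expression shown in this hunk.
        return (
            batch_size > 1
            or (device_name == "arc" and qtype in [SYM_INT8, FP4])        # "flex" removed
            or (device_name in ["arc", "mtl"] and qtype == FP8E4)
            or (device_name == "lnl" and qtype == SYM_INT4 and in_features % 512 == 0)
            or (device_name == "bmg" and qtype in [SYM_INT4, FP8E5])
        )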
@@ -603,7 +603,7 @@ class LowBitLinear(nn.Linear):
         # empty cache before and after lm_head at first token when input > 1024
         # on arc or IPEX_LLM_LOW_MEM is set to 1 at inference time.
         if self.device is None:
-            self.device = get_xpu_device_type(self.weight.data)
+            self.device = get_xpu_device_name(self.weight.data.device)
             self.low_memory_mode = \
                 self.low_memory_mode and \
                 (self.device == "arc" or os.environ.get("IPEX_LLM_LOW_MEM", None) == "1")
@@ -782,7 +782,7 @@ class FP16Linear(nn.Linear):
         if not self.use_esimd_kernel(x):
             if (
                 get_ipex_version() < "2.1.10+xpu"
-                or get_xpu_device_type(x) not in ["arc", "flex", "pvc"]
+                or get_xpu_device_name(x.device) not in ["arc", "pvc"]
                 or self.disable_fp16_opt
             ):
                 if self.weight_type == 2:
@@ -848,7 +848,7 @@ class FP16Linear(nn.Linear):
             return result.to(x.dtype)

     def use_esimd_kernel(self, x):
-        gpu_type = get_xpu_device_type(x)
+        gpu_type = get_xpu_device_name(x.device)
         if self.disable_fp16_opt:
             return False
         # esimd kernel can only be used for Arc and Flex
ipex_llm/transformers/models/chatglm2.py CHANGED
@@ -269,7 +269,7 @@ def chatglm2_attention_forward(
     # IPEX-LLM OPT: fuse rope
     inv_freq, position_ids = rotary_pos_emb
     rot_dim = inv_freq.size(-1) * 2
-    if should_use_fuse_rope(hidden_states, rotary_pos_emb[1], self.training):
+    if should_use_fuse_rope(hidden_states, position_ids, self.training):
         import xe_addons
         xe_addons.rotary_two_inplaced(inv_freq, position_ids,
                                       query_states[..., :rot_dim], key_states[..., :rot_dim])
@@ -321,197 +321,6 @@ def chatglm2_attention_forward(
     return output, past_key_value


-@torch.jit.script
-def apply_rotary_pos_emb_original(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
-    # x: [sq, b, np, hn]
-    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
-    rot_dim = rope_cache.shape[-2] * 2
-    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
-    # truncate to support variable sizes
-    rope_cache = rope_cache[:sq]
-    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
-    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
-    x_out2 = torch.stack(
-        [
-            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
-            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
-        ],
-        -1,
-    )
-    x_out2 = x_out2.flatten(3)
-    return torch.cat((x_out2, x_pass), dim=-1)
-
-
-def codegeex_model_forward(
-    self,
-    input_ids,
-    position_ids: Optional[torch.Tensor]=None,
-    attention_mask: Optional[torch.BoolTensor]=None,
-    full_attention_mask: Optional[torch.BoolTensor]=None,
-    past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]=None,
-    inputs_embeds: Optional[torch.Tensor]=None,
-    use_cache: Optional[bool]=None,
-    output_hidden_states: Optional[bool]=None,
-    return_dict: Optional[bool]=None,
-):
-    output_hidden_states = (
-        output_hidden_states if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    use_cache = use_cache if use_cache is not None else self.config.use_cache
-    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-    if inputs_embeds is None:
-        batch_size, seq_length = input_ids.shape
-        inputs_embeds = self.embedding(input_ids)
-    else:
-        inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
-        seq_length, batch_size, _ = inputs_embeds.shape
-        input_ids = torch.empty((batch_size, seq_length),
-                                dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-
-    if full_attention_mask is None:
-        if (attention_mask is not None and not attention_mask.all()) or (
-                past_key_values and seq_length != 1):
-            full_attention_mask = self.get_masks(input_ids,
-                                                 past_key_values,
-                                                 padding_mask=attention_mask)
-
-    # ipex-llm changes begin
-    # 1. replace `rotary_pos_emb` with `inv_freq` and `position_ids`
-    # 2. generate `causal_mask` and replace `full_attention_mask` with it
-    if position_ids is None:
-        if past_key_values is None:
-            position_ids = torch.arange(seq_length, dtype=torch.int64, device=inputs_embeds.device)
-        else:
-            if isinstance(past_key_values, DynamicCompressCache):
-                kv_length = past_key_values.get_seq_length()
-            else:
-                kv_length = past_key_values[0][0].size(0)
-            position_ids = torch.arange(kv_length, kv_length + seq_length,
-                                        dtype=torch.int64, device=inputs_embeds.device)
-        position_ids = position_ids.repeat(batch_size, 1)
-    use_fuse_rope = input_ids.device.type == "xpu" and not self.training
-
-    # Rotary positional embeddings
-    rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
-    if position_ids is not None:
-        rotary_pos_emb = rotary_pos_emb[position_ids]
-    else:
-        rotary_pos_emb = rotary_pos_emb[None, :seq_length]
-    if use_fuse_rope:
-        # Repeat cos sin here, call only once for each token.
-        # Chatglm2's rotary embedding is similar to gptj's, is rotate_every_two.
-        # If put this to attension forward, it will generate too many times.
-        cos, sin = rotary_pos_emb.split(rotary_pos_emb.shape[-1] // 2, dim=-1)
-        cos = cos.squeeze(-1)
-        sin = sin.squeeze(-1)
-        cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
-        sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
-        rotary_pos_emb = (cos, sin)
-    else:
-        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
-
-    # `full_attention_mask` is not None only when
-    # `past_key_values` is not None and `seq_length` > 1
-    if full_attention_mask is not None:
-        causal_mask = torch.zeros([batch_size, 1, seq_length, full_attention_mask.size(-1)],
-                                  dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-        mask_value = torch.finfo(inputs_embeds.dtype).min
-        causal_mask.masked_fill_(full_attention_mask, mask_value)
-    elif self.training or (inputs_embeds.device.type != "xpu" and past_key_values is None):
-        full_attention_mask = self.get_masks(input_ids,
-                                             past_key_values,
-                                             padding_mask=attention_mask)
-        causal_mask = torch.zeros([batch_size, 1, seq_length, full_attention_mask.size(-1)],
-                                  dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-        mask_value = torch.finfo(inputs_embeds.dtype).min
-        causal_mask.masked_fill_(full_attention_mask, mask_value)
-    else:
-        causal_mask = None
-
-    # Run encoder.
-    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
-        inputs_embeds, causal_mask,
-        rotary_pos_emb=rotary_pos_emb,
-        kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
-    )
-    # ipex-llm changes end
-
-    if not return_dict:
-        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions]
-                     if v is not None)
-
-    return BaseModelOutputWithPast(
-        last_hidden_state=hidden_states,
-        past_key_values=presents,
-        hidden_states=all_hidden_states,
-        attentions=all_self_attentions,
-    )
-
-
-def codegeex_attention_forward(
-    self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
-):
-    q_len, bsz, _ = hidden_states.size()
-    n_head = self.num_attention_heads_per_partition
-    n_kv_head = self.num_multi_query_groups_per_partition if self.multi_query_attention else n_head
-    head_dim = self.hidden_size_per_attention_head
-
-    past_key_value = None if kv_cache is None else (kv_cache[0].permute(1, 2, 0, 3),
-                                                    kv_cache[1].permute(1, 2, 0, 3))
-    qkv = self.query_key_value(hidden_states)
-    qkv = qkv.view(q_len, bsz, n_head + 2 * n_kv_head, head_dim)
-    # [seq_len, bsz, n_head, head_dim] -> [bsz, n_head, seq_len, head_dim]
-    qkv = qkv.permute(1, 2, 0, 3)
-    query_layer, key_layer, value_layer = qkv.split([n_head,
-                                                     n_kv_head,
-                                                     n_kv_head], dim=1)
-    kv_seq_len = key_layer.shape[2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[2]
-
-    # apply relative positional encoding (rotary embedding)
-    if len(rotary_pos_emb) == 2 and isinstance(rotary_pos_emb, tuple):
-        cos, sin = rotary_pos_emb
-        rot_dim = cos.shape[-1]
-        query_layer = query_layer.transpose(1, 2)
-        key_layer = key_layer.transpose(1, 2)
-        query_layer_cur = query_layer[..., :rot_dim]
-        key_layer_cur = key_layer[..., :rot_dim]
-        # ipex_llm's apply_rotary_embedding can change the origin storage,
-        # so query_layer will get the result directly.
-        torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur)
-        torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur)
-        query_layer = query_layer.transpose(1, 2)
-        key_layer = key_layer.transpose(1, 2)
-    else:
-        query_layer = apply_rotary_pos_emb_original(query_layer, rotary_pos_emb)
-        key_layer = apply_rotary_pos_emb_original(key_layer, rotary_pos_emb)
-
-    key_layer, value_layer = update_past_key_value(
-        past_key_value, key_layer, value_layer,
-        kv_seq_len, False, hidden_states.device
-    )
-    # past_key_value: [bsz, n_kv_head, seq_len, head_dim] -> [seq_len, bsz, n_kv_head, head_dim]
-    past_key_value = (key_layer.permute(2, 0, 1, 3),
-                      value_layer.permute(2, 0, 1, 3)) if use_cache else None
-
-    # =================
-    # Output. [sq, b, h]
-    # =================
-    context_layer = scaled_dot_product_attention(
-        query_layer, key_layer, value_layer,
-        attention_mask, q_len == kv_seq_len
-    )
-
-    context_layer = context_layer.permute(2, 0, 1, 3).contiguous().view(q_len,
-                                                                        bsz,
-                                                                        n_head * head_dim)
-    output = self.dense(context_layer)
-
-    return output, past_key_value
-
 import torch.nn.functional as F


ipex_llm/transformers/models/minicpmv.py CHANGED
@@ -53,10 +53,10 @@ def siglip_attention_forward(
     qkv = qkv.transpose(1, 2)
     query_states, key_states, value_states = qkv.chunk(3, dim=1)

-    from ipex_llm.transformers.utils import get_xpu_device_type
+    from ipex_llm.transformers.utils import get_xpu_device_name
     if (
         self.head_dim == 72
-        and get_xpu_device_type(query_states) in ["arc", "flex"] and
+        and get_xpu_device_name(query_states.device) == "arc" and
         query_states.dtype in [torch.float, torch.half]
     ):
         n_heads, kv_length = query_states.size(1), key_states.size(2)
ipex_llm/transformers/models/sd.py CHANGED
@@ -36,7 +36,7 @@ import math
 import torch
 from typing import Optional

-from ipex_llm.transformers.utils import get_xpu_device_type
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.models.common import padding_qkv_hd
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
 from diffusers.models.attention_processor import Attention
@@ -144,7 +144,7 @@ class AttnProcessor2_0:

 def upcast_vae(self):
     # workaround overflow and ipex's bugs
-    if get_xpu_device_type(self.vae.post_quant_conv.weight) in ["arc", "flex", "pvc"]:
+    if get_xpu_device_name(self.vae.post_quant_conv.weight.device) == "arc":
         self.vae.to(torch.bfloat16)
     else:
         self.vae.decoder.up_blocks.to(torch.bfloat16)
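Note: after this change only devices reported as "arc" upcast the whole VAE to bfloat16; other devices upcast just the decoder's up_blocks. A small sketch of that behaviour, assuming `vae` has the same attribute layout as diffusers' AutoencoderKL and `device_name` comes from the device-name helper shown earlier in this diff:

    import torch

    def upcast_vae_sketch(vae, device_name: str) -> None:
        # "flex" and "pvc" previously also took the whole-VAE branch.
        if device_name == "arc":
            vae.to(torch.bfloat16)
        else:
            vae.decoder.up_blocks.to(torch.bfloat16)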