ipex-llm 2.2.0b20250105__py3-none-win_amd64.whl → 2.2.0b20250105.post0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +17 -132
- ipex_llm/transformers/lookup.py +2 -2
- ipex_llm/transformers/low_bit_linear.py +8 -8
- ipex_llm/transformers/models/chatglm2.py +1 -192
- ipex_llm/transformers/models/minicpmv.py +2 -2
- ipex_llm/transformers/models/sd.py +2 -2
- ipex_llm/transformers/models/utils.py +14 -89
- ipex_llm/transformers/npu_model.py +80 -50
- ipex_llm/transformers/npu_models/convert_mp.py +1 -1
- ipex_llm/transformers/npu_models/linear.py +15 -3
- ipex_llm/transformers/npu_models/lm_head.py +1 -90
- ipex_llm/transformers/npu_models/lm_head_linear.py +106 -0
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +5 -8
- ipex_llm/transformers/utils.py +5 -20
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/METADATA +40 -19
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/RECORD +51 -53
- ipex_llm/transformers/models/cohere.py +0 -589
- ipex_llm/transformers/models/falcon.py +0 -829
- ipex_llm/transformers/models/mixtral.py +0 -576
- {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250105.data → ipex_llm-2.2.0b20250105.post0.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250105.dist-info → ipex_llm-2.2.0b20250105.post0.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/models/utils.py

@@ -19,7 +19,7 @@ import torch
 import warnings
 from ipex_llm.utils.common import invalidInputError
 from ipex_llm.ggml.quantize import ggml_tensor_qtype
-from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_type
+from ipex_llm.transformers.utils import get_ipex_version, get_xpu_device_name
 from ipex_llm.transformers.low_bit_linear import SYM_INT4, SYM_INT8, FP8E5, IQ2_XXS, FP4, FP8E4,\
     FP6, ASYM_INT4

@@ -85,16 +85,14 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
+    elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+        return False
     else:
-
-
-
-
-
-def kv_cache_device_check(x: torch.Tensor, kv_group: int) -> bool:
-    return (get_xpu_device_type(x) in ["mtl", "lnl"] and kv_group <= 1) or \
-        ((get_xpu_device_type(x) == "arc" or get_xpu_device_type(x) == "flex") and
-            1 < x.size(0) and x.size(0) <= 8)
+        device_name = get_xpu_device_name(x.device)
+        return (
+            device_name in ["mtl", "lnl", "arl"] and kv_group == 1
+            or device_name in ["arc", "bmg"] and x.size(0) > 1
+        )


 def init_fp8_kv_cache(batch_size, num_heads, current_length, head_dim, device):
@@ -226,57 +224,6 @@ def is_enough_kv_cache_room_4_31(past_key_value, seq_len=1):
         (past_key_value[0].size(2) + seq_len) * past_key_value[0].size(3)


-def use_flash_attention(query, key, attention_mask=None):
-    # here we support query's shape is always [batch_size, head_num, q_len, head_dim],
-    # key's shape is always [batch_size, head_num, k_len, head_dim]
-    invalidInputError(query.dim() == 4,
-                      "Here query input of use_flash_attention should be [batch_size, "
-                      "head_num, q_len, head_dim]")
-    invalidInputError(key.dim() == 4,
-                      "Here key input of use_flash_attention should be [batch_size, "
-                      "head_num, k_len, head_dim]")
-    bsz, _, q_len, _ = query.size()
-    k_len = key.size()[2]
-    # check whether ipex flash attention can be used
-    if q_len != k_len:
-        # now only use flash attention for first token
-        # as it seems have no performance benifit for rest token now
-        return False
-    if query.device.type != "xpu":
-        # ipex flash attention only support for xpu
-        return False
-    ipex_version = get_ipex_version()
-    if ipex_version <= "2.0.110+xpu":
-        # ipex flash attention is supported from ipex 2.1
-        return False
-    if not torch.xpu.has_xetla():
-        # ipex flash attention is only supported for xetla
-        # may update this later
-        return False
-    elif get_xpu_device_type(query) != "pvc":
-        return False
-    if query.dtype not in [torch.float32, torch.float16]:
-        # only use flash attention for fp32/fp16 input
-        return False
-    if bsz > 1:
-        # as flash attention doesn't support attn_mask in ipex 2.1,
-        # so it will cause output error for padded batch input
-        if attention_mask is None:
-            return True
-        else:
-            # TODO: below logic may change for different model
-            # attention mask shape : [bsz, 1, q_len, k_len]
-            if attention_mask[0].squeeze()[0, 0].item() != 0:
-                # first batch contains padding
-                # otherwise we suppose it should be a upper triangular matrix
-                # at the same time, the diagonal is also 0
-                return False
-            elif not attention_mask.equal(attention_mask[0].repeat(bsz, 1, 1, 1)):
-                # check whether mask of every batch is the same
-                return False
-    return True
-
-
 def use_sdp(q_len, kv_len, head_dim, query_states):
     return (
         query_states.device.type == "xpu"
@@ -315,38 +262,16 @@ def mlp_fusion_check(x, qtype, training):
     if training or x.requires_grad:
         return False
     if qtype == FP6:
-        device = get_xpu_device_type(x)
-        if device in ["mtl", "lnl"]:
+        device = get_xpu_device_name(x.device)
+        if device in ["mtl", "lnl", "arl"]:
             return False
     return True


-def use_decoding_fast_path(proj,
-                           use_fuse_rope,
-                           enough_kv_room,
-                           bs,
-                           qtype_check=decoding_fast_path_qtype_check):
-    if proj is None:
-        return False
-    device = get_xpu_device_type(proj.weight)
-    if not qtype_check(proj):
-        return False
-    if not use_fuse_rope:
-        return False
-    if not enough_kv_room:
-        return False
-    if bs != 1:
-        return False
-
-    if device in ["uhd"]:
-        return False
-    return True
-
-
 def use_xmx(x: torch.Tensor, qtype: int):
-    device = get_xpu_device_type(x)
+    device = get_xpu_device_name(x.device)
     return (
-        device in ["arc", "
+        device in ["arc", "pvc"]
         and qtype in [SYM_INT4, SYM_INT8, FP8E4, FP8E5]
         and (
             (device == "pvc" and 1 < x.size(0) <= 16)
@@ -370,7 +295,7 @@ def fp16_fusion_check(proj, x, training):
         return False
     if x.requires_grad:
         return False
-    device_type = get_xpu_device_type(x)
+    device_type = get_xpu_device_name(x.device)
     if device_type != "pvc":
         return False
     return True
@@ -439,7 +364,7 @@ def should_use_compresskv(x: torch.Tensor, prompt_len: int):
     else:
         if use_compress_kv is None:
             return (
-                get_xpu_device_type(x) in ["mtl", "lnl"]
+                get_xpu_device_name(x.device) in ["mtl", "lnl", "arl"]
                 and prompt_len >= 1800
                 and prompt_len <= 4500
             )

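The net effect of the models/utils.py hunks is that KV-cache quantization is now gated on the new get_xpu_device_name() helper (which takes a torch.device rather than a tensor) and is disabled outright for fp16/bf16 linears. Below is a minimal, self-contained sketch of the new decision logic, written against plain strings and os.environ instead of the real ipex_llm helpers; the function name and parameters are illustrative only, not part of the package.

    import os

    def should_quantize_kv_cache(qtype: str, device_name: str, batch_size: int, kv_group: int) -> bool:
        # explicit environment overrides still win, as in the original function
        if os.environ.get("IPEX_LLM_QUANTIZE_KV_CACHE") is not None:
            return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
        if os.environ.get("IPEX_LLM_LOW_MEM") is not None:
            return os.environ["IPEX_LLM_LOW_MEM"] == "1"
        # new in this release: fp16/bf16 linears never use a quantized KV cache
        if qtype in ("fp16", "bf16"):
            return False
        # device-name gating replaces the removed kv_cache_device_check() helper
        return (device_name in ["mtl", "lnl", "arl"] and kv_group == 1) or \
            (device_name in ["arc", "bmg"] and batch_size > 1)

    print(should_quantize_kv_cache("sym_int4", "arc", batch_size=4, kv_group=1))   # True
    print(should_quantize_kv_cache("fp16", "arc", batch_size=4, kv_group=1))       # False
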
ipex_llm/transformers/npu_model.py

@@ -27,7 +27,7 @@ from transformers.configuration_utils import PretrainedConfig

 from ipex_llm.utils.common.log4Error import invalidInputError
 from ipex_llm.transformers.utils import logger, load_imatrix_data
-from ipex_llm.transformers.npu_models.convert import optimize_llm
+from ipex_llm.transformers.npu_models.convert import optimize_llm


 def patch_flash_attn_import(filename: str) -> List[str]:
@@ -207,8 +207,6 @@ class _BaseAutoModelClass:
                 model = model.eval()
                 logger.info(f"Finish to convert model")
             else:
-                from intel_npu_acceleration_library.compiler import create_npu_kernels
-
                 if optimize_model:
                     invalidInputError(
                         max_prompt_len < max_context_len,
@@ -232,11 +230,14 @@ class _BaseAutoModelClass:
                         "convert_model": convert_model,
                         "save_directory": save_directory,
                         "fuse_layers": fuse_layers,
-                        "imatrix_data": imatrix_data
+                        "imatrix_data": imatrix_data,
+                        "skip_npu_logic": mock_device == "dummy",
                     }
+                    # Dummy will skip npu related logic and save the quantized model
+                    if mock_device == "dummy":
+                        model.save_low_bit = types.MethodType(save_low_bit, model)
                     model = cls.optimize_npu_model(*args, **optimize_kwargs)
                 else:
-                    from ipex_llm.transformers.npu_models.convert import optimize_llm
                     optimize_llm(model)
                     with torch.no_grad():
                         cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
@@ -258,7 +259,6 @@ class _BaseAutoModelClass:
     def optimize_npu_model(cls, *args, **kwargs):

         from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre, optimize_llm
-        from intel_npu_acceleration_library.compiler import create_npu_kernels

         model = kwargs.pop("model")
         qtype = kwargs.pop("qtype", "sym_int4_rtn")
@@ -275,6 +275,7 @@ class _BaseAutoModelClass:
         save_directory = kwargs.pop('save_directory', None)
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
+        skip_npu_logic = kwargs.pop("skip_npu_logic", False)
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
@@ -294,51 +295,58 @@ class _BaseAutoModelClass:
         cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
                          quantization_group_size, imatrix_data,
                          *args, **kwargs)
-
+        if not skip_npu_logic:
+            from intel_npu_acceleration_library.compiler import create_npu_kernels
+            create_npu_kernels(llm)
         model = model.eval()
         logger.info(f"Finish to convert model")
         model.config.update({"bigdl_transformers_low_bit": qtype})
-        model.share_memory()

-        if
-
-            from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
-            optimize_llm_single_process(
-                llm,
-                kv_len=max_context_len,
-                max_prompt_len=max_prompt_len,
-                transpose_value_cache=transpose_value_cache,
-                group_size=quantization_group_size,
-                qtype=qtype,
-                save_directory=save_directory,
-                fuse_layers=fuse_layers,
-                has_llm=hasattr(model, "llm")
-            )
-        else:
-            optimize_llm(
-                llm,
-                max_context_len=max_context_len,
-                max_prompt_len=max_prompt_len,
-                inter_pp=inter_pp,
-                intra_pp=intra_pp,
-                transpose_value_cache=transpose_value_cache,
-                group_size=quantization_group_size
-            )
+        if skip_npu_logic:
+            model.save_low_bit(model_dir=save_directory)
         else:
-
-
-
+            model.share_memory()
+
+            if not pipeline:
+                if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                    from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                    optimize_llm_single_process(
+                        llm,
                         kv_len=max_context_len,
                         max_prompt_len=max_prompt_len,
                         transpose_value_cache=transpose_value_cache,
                         group_size=quantization_group_size,
                         qtype=qtype,
-                        convert_model=convert_model,
                         save_directory=save_directory,
-                        fuse_layers=fuse_layers
-
-
-
+                        fuse_layers=fuse_layers,
+                        has_llm=hasattr(model, "llm")
+                    )
+                else:
+                    optimize_llm(
+                        llm,
+                        max_context_len=max_context_len,
+                        max_prompt_len=max_prompt_len,
+                        inter_pp=inter_pp,
+                        intra_pp=intra_pp,
+                        transpose_value_cache=transpose_value_cache,
+                        group_size=quantization_group_size
+                    )
+            else:
+                from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
+                    import convert_llm
+                convert_llm(llm,
+                            kv_len=max_context_len,
+                            max_prompt_len=max_prompt_len,
+                            transpose_value_cache=transpose_value_cache,
+                            group_size=quantization_group_size,
+                            qtype=qtype,
+                            convert_model=convert_model,
+                            save_directory=save_directory,
+                            fuse_layers=fuse_layers)
+            model.save_low_bit = types.MethodType(save_low_bit, model)
+            model.save_low_bit(save_directory)
+            logger.info(f"Converted model has already saved to {save_directory}.")
+
         return model

     @classmethod
@@ -379,6 +387,7 @@ class _BaseAutoModelClass:
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+        save_directory = kwargs.pop('save_directory', None)

         from transformers.models.auto.configuration_auto import AutoConfig
         from transformers.modeling_utils import no_init_weights, get_state_dict_dtype
@@ -650,16 +659,37 @@ class _BaseAutoModelClass:
                 param.requires_grad_(False)

         if optimize_model and not pipeline:
-
-
-
-
-
-
-
-
-
-
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                if save_directory is None:
+                    invalidInputError(False,
+                                      "Please specify the save_directory, the path of folder " +
+                                      "to save the compiled NPU model. If path not exists, " +
+                                      "the compiled NPU model will be saved there. " +
+                                      "Else, program will exit.")
+
+                optimize_llm_single_process(
+                    llm,
+                    kv_len=max_context_len,
+                    max_prompt_len=max_prompt_len,
+                    transpose_value_cache=transpose_value_cache,
+                    group_size=quantization_group_size,
+                    qtype=qtype,
+                    save_directory=save_directory,
+                    fuse_layers=None,
+                    has_llm=hasattr(model, "llm")
+                )
+            else:
+                from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
+                optimize_llm(
+                    llm,
+                    max_context_len=max_context_len,
+                    max_prompt_len=max_prompt_len,
+                    inter_pp=inter_pp,
+                    intra_pp=intra_pp,
+                    transpose_value_cache=transpose_value_cache,
+                    group_size=quantization_group_size
+                )
         elif optimize_model and pipeline:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm

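The npu_model.py hunks thread a new skip_npu_logic flag (driven by mock_device == "dummy") through optimize_npu_model, and register save_directory as a recognized keyword on the load path. Here is a self-contained sketch of the resulting branching; choose_npu_path is a hypothetical helper written only to make the control flow explicit, not an ipex-llm function.

    def choose_npu_path(mock_device=None, pipeline=False, model_type="qwen2"):
        # "dummy" skips every intel_npu_acceleration_library call and only saves low-bit weights
        skip_npu_logic = mock_device == "dummy"
        if skip_npu_logic:
            return "save_low_bit only (no NPU compilation)"
        if not pipeline:
            if model_type in ["qwen2", "llama", "minicpm"]:
                return "optimize_llm_single_process, then save_low_bit"
            return "optimize_llm (multi-process path), then save_low_bit"
        return "convert_llm pipeline conversion, then save_low_bit"

    print(choose_npu_path(mock_device="dummy"))
    print(choose_npu_path(model_type="minicpm"))
    print(choose_npu_path(pipeline=True))
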
ipex_llm/transformers/npu_models/convert_mp.py

@@ -18,7 +18,7 @@ import torch
 import importlib
 import numpy as np
 from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params
-from ipex_llm.transformers.npu_models.lm_head import
+from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from ipex_llm.utils.common.log4Error import invalidInputError


ipex_llm/transformers/npu_models/linear.py

@@ -21,16 +21,25 @@
 # SPDX-License-Identifier: Apache 2.0
 #

-from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
-from intel_npu_acceleration_library.dtypes import NPUDtype
+
 import os
 import torch
 from torch.nn import Parameter
 import uuid
 import math
-from intel_npu_acceleration_library.backend import run_matmul
 from typing import Optional, Union
 from ipex_llm.utils.common import invalidInputError
+import importlib
+
+
+def is_acclib_available():
+    return importlib.util.find_spec("intel_npu_acceleration_library") is not None
+
+
+if is_acclib_available():
+    from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+    from intel_npu_acceleration_library.dtypes import NPUDtype
+    from intel_npu_acceleration_library.backend import run_matmul


 class Linear(torch.nn.Module):
@@ -63,6 +72,7 @@ class Linear(torch.nn.Module):
         if self.training:
             out = self._mm(x, self.weight, None)
         else:
+            from intel_npu_acceleration_library.backend import run_matmul
             out = run_matmul(x, self.weight, None, self.op_id)

         if self.bias is None:
@@ -105,6 +115,8 @@ class Linear(torch.nn.Module):
         Returns:
             Union[Linear, QuantizedLinear]: A NPU linear layer
         """
+        from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+        from intel_npu_acceleration_library.dtypes import NPUDtype
         if dtype.is_floating_point:
             if bias is None:
                 return Linear(weight.to(dtype), None)

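The linear.py hunks move every intel_npu_acceleration_library import behind an is_acclib_available() probe or into the call sites that need it, so importing the module no longer hard-requires the NPU library. A small, runnable sketch of the same optional-dependency pattern follows; it probes numpy purely as a stand-in package.

    import importlib.util

    def is_available(pkg: str) -> bool:
        # same probe the diff uses: look the package up without importing it
        return importlib.util.find_spec(pkg) is not None

    if is_available("numpy"):            # the diff probes "intel_npu_acceleration_library"
        import numpy as np
        print("accelerated path:", np.arange(3))
    else:
        print("fallback path: optional dependency not installed")
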
ipex_llm/transformers/npu_models/lm_head.py

@@ -16,96 +16,6 @@
 import torch
 from torch import nn
 import numpy as np
-from filelock import FileLock
-from intel_npu_acceleration_library.backend import NNFactory
-from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
-
-
-class LMHeadLinear(NNFactory):
-    """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
-    with weights prefetching."""
-
-    def __init__(
-        self,
-        inC: int,
-        outC: int,
-        batch: int,
-        split_num: int = 2,
-        profile: bool = False,
-        device: str = "NPU",
-        dtype: np.dtype = np.int8,
-        use_split: bool = False,
-        group_size: int = 0,
-        asym: bool = False,
-    ):
-        """Initialize the LMHeadLinear class.
-
-        Args:
-            inC (int): input channels
-            outC (int): output channels
-            batch (int): batch
-            split_num (int): split in_features of lm_head to how many parts
-            profile (bool): Enable/Disable profiling. Defaults to False.
-            device (str): Target device, default to "NPU".
-            dtype (np.dtype): weights datatype. Defaults to np.int8.
-
-        """
-        super().__init__(profile, device)
-        self.inC, self.outC = inC, outC
-        self.batch = batch
-
-        self.split_num = split_num
-        if use_split:
-            input = self.parameter((1, self.batch, self.inC))
-            res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
-                                       scale_factor=(group_size == 0), asym=asym)
-        else:
-            input = self.parameter((self.batch, self.inC))
-            split_size = self.inC // split_num // 2 * 2
-
-            for i in range(self.split_num):
-                start_idx = i * split_size
-                end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
-                input_slice = self.slice(input, begin=[0, start_idx],
-                                         end=[self.batch, end_idx])
-                linear_slice = self.linear(input_slice, outC, split_size, bias=False,
-                                           wt_dtype=dtype, asym=asym)
-                if i == 0:
-                    res = linear_slice
-                else:
-                    res += linear_slice
-
-        print("start compiling lm_head")
-        self.compile()
-        print("end compiling lm_head")
-
-    def set_weights(self, op_id, weights):
-        self.set_weights_async(op_id, weights)
-        with FileLock(f"lmhead_run.lock"):
-            backend_lib.run(self._mm)
-
-    def set_weights_async(self, op_id, weights):
-        self.setWeights(1, op_id, *weights)
-
-    def run(
-        self, X: np.ndarray
-    ) -> np.ndarray:
-        """Run the layer: $X * (W * S)^T$ .
-
-        Args:
-            X (np.ndarray): activation
-
-        Raises:
-            RuntimeError: Input, weights or scale shape mismatch
-
-        Returns:
-            np.ndarray: result
-        """
-        self.set_input_tensor(X, 0)
-        self.elapsed = backend_lib.run(self._mm)
-        if len(self.out) == 1:
-            return self.out[0]
-        return self.out


 class SlicedLMHead(nn.Module):
@@ -160,6 +70,7 @@ class SlicedLMHead(nn.Module):
         return self.lm_heads[0].weight.dtype

     def get_fused_lm_head(self):
+        from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear
         np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8
         self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
                                           False, "NPU", dtype=np_dtype, use_split=self.use_split,

ipex_llm/transformers/npu_models/lm_head_linear.py

@@ -0,0 +1,106 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from filelock import FileLock
+from intel_npu_acceleration_library.backend import NNFactory
+from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+
+
+class LMHeadLinear(NNFactory):
+    """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
+    with weights prefetching."""
+
+    def __init__(
+        self,
+        inC: int,
+        outC: int,
+        batch: int,
+        split_num: int = 2,
+        profile: bool = False,
+        device: str = "NPU",
+        dtype: np.dtype = np.int8,
+        use_split: bool = False,
+        group_size: int = 0,
+        asym: bool = False,
+    ):
+        """Initialize the LMHeadLinear class.
+
+        Args:
+            inC (int): input channels
+            outC (int): output channels
+            batch (int): batch
+            split_num (int): split in_features of lm_head to how many parts
+            profile (bool): Enable/Disable profiling. Defaults to False.
+            device (str): Target device, default to "NPU".
+            dtype (np.dtype): weights datatype. Defaults to np.int8.
+
+        """
+        super().__init__(profile, device)
+        self.inC, self.outC = inC, outC
+        self.batch = batch
+
+        self.split_num = split_num
+        if use_split:
+            input = self.parameter((1, self.batch, self.inC))
+            res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
+                                       scale_factor=(group_size == 0), asym=asym)
+        else:
+            input = self.parameter((self.batch, self.inC))
+            split_size = self.inC // split_num // 2 * 2
+
+            for i in range(self.split_num):
+                start_idx = i * split_size
+                end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
+                input_slice = self.slice(input, begin=[0, start_idx],
+                                         end=[self.batch, end_idx])
+                linear_slice = self.linear(input_slice, outC, split_size, bias=False,
+                                           wt_dtype=dtype, asym=asym)
+                if i == 0:
+                    res = linear_slice
+                else:
+                    res += linear_slice
+
+        print("start compiling lm_head")
+        self.compile()
+        print("end compiling lm_head")
+
+    def set_weights(self, op_id, weights):
+        self.set_weights_async(op_id, weights)
+        with FileLock(f"lmhead_run.lock"):
+            backend_lib.run(self._mm)
+
+    def set_weights_async(self, op_id, weights):
+        self.setWeights(1, op_id, *weights)
+
+    def run(
+        self, X: np.ndarray
+    ) -> np.ndarray:
+        """Run the layer: $X * (W * S)^T$ .
+
+        Args:
+            X (np.ndarray): activation
+
+        Raises:
+            RuntimeError: Input, weights or scale shape mismatch
+
+        Returns:
+            np.ndarray: result
+        """
+        self.set_input_tensor(X, 0)
+        self.elapsed = backend_lib.run(self._mm)
+        if len(self.out) == 1:
+            return self.out[0]
+        return self.out

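LMHeadLinear, now hosted in its own lm_head_linear.py module, builds the lm_head as split_num partial linears over slices of the input features and sums the results. The following hedged NumPy sketch shows why the sliced computation matches the full matmul; shapes and names are illustrative and do not model the NPU graph itself.

    import numpy as np

    rng = np.random.default_rng(0)
    batch, inC, outC, split_num = 1, 8, 6, 2
    x = rng.standard_normal((batch, inC))
    w = rng.standard_normal((outC, inC))            # lm_head weight laid out as [outC, inC]

    split_size = inC // split_num // 2 * 2          # same even-sized split rule as __init__
    res = np.zeros((batch, outC))
    for i in range(split_num):
        start = i * split_size
        end = (i + 1) * split_size if i < split_num - 1 else inC
        res += x[:, start:end] @ w[:, start:end].T  # one sliced linear per iteration

    assert np.allclose(res, x @ w.T)                # identical to the unsliced lm_head
    print("sliced and full matmul agree")
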
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py

@@ -473,10 +473,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)
-        if model.can_generate():
-            model.generation_config.save_pretrained(save_directory)

         from .qwen import convert_qwen_layer, convert_fused_qwen_layer
         from .qwen import convert_lm_head_and_embedding
@@ -537,8 +533,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)

         from .llama import convert_llama_layer, convert_fused_llama_layer
         from .llama import convert_lm_head_and_embedding
@@ -577,8 +571,6 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                        "n_splits_linear": n_splits_linear,
                        "n_splits_down_proj": n_splits_down_proj,
                        "lm_head_low_bit": lm_head_low_bit}
-        model.config.update(update_dict)
-        model.config.save_pretrained(save_directory)

         from .minicpm import convert_minicpm_layer, convert_fused_minicpm_layer
         from .minicpm import convert_lm_head_and_embedding
@@ -595,3 +587,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir,
                             convert_model=True,
                             max_prompt_len=max_prompt_len)
+
+    model.config.update(update_dict)
+    model.config.save_pretrained(save_directory)
+    if model.can_generate():
+        model.generation_config.save_pretrained(save_directory)
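The convert_pipeline.py hunks stop writing the updated config inside each model-type branch and instead apply update_dict and save config/generation_config once, after conversion finishes. A minimal sketch of that reordering is below; ConfigSketch is a stand-in class, not transformers' PretrainedConfig.

    class ConfigSketch(dict):
        def save_pretrained(self, path):
            print(f"writing {dict(self)} to {path}")

    def convert_for_deploy_sketch(model_type: str, save_directory: str):
        update_dict = {"model_type": model_type, "lm_head_low_bit": "sym_int4_rtn"}
        # ... per-model-type layer conversion happens here, possibly failing early ...
        config = ConfigSketch()
        config.update(update_dict)            # applied once, only after conversion succeeds
        config.save_pretrained(save_directory)

    convert_for_deploy_sketch("qwen2", "./npu_compiled_model")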