ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250206__py3-none-win_amd64.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those package versions.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/low_bit_linear.py +5 -4
- ipex_llm/transformers/npu_model.py +17 -4
- ipex_llm/transformers/npu_models/convert.py +6 -2
- ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +27 -12
- ipex_llm/transformers/npu_pipeline_model/llama.py +24 -11
- ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
- ipex_llm/transformers/npu_pipeline_model/qwen.py +17 -8
- ipex_llm/transformers/qlora.py +2 -2
- ipex_llm/transformers/utils.py +19 -6
- ipex_llm/transformers/xpu_customize_fwd.py +6 -4
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/RECORD +47 -47
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250206.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250206.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250206.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file
ipex_llm/libs/bloom.dll
CHANGED
Binary file
ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file
ipex_llm/libs/gptneox.dll
CHANGED
Binary file
ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file
ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file
ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file
ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file
ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file
ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file
ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file
ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file
ipex_llm/libs/llama-api.dll
CHANGED
Binary file
ipex_llm/libs/llama.dll
CHANGED
Binary file
ipex_llm/libs/main-bloom.exe
CHANGED
Binary file
ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file
ipex_llm/libs/main-llama.exe
CHANGED
Binary file
ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file
ipex_llm/libs/pipeline.dll
CHANGED
Binary file
ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file
ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file
ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file
ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file
ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file
ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file
ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file
ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file
ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file
ipex_llm/libs/starcoder.dll
CHANGED
Binary file
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -51,7 +51,8 @@ from torch import Tensor, dtype, nn
 from operator import mul
 from functools import reduce
 from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd
-from ipex_llm.transformers.utils import
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+from ipex_llm.transformers.utils import get_xpu_device_name
 from ipex_llm.transformers.convert import is_deepspeed_available, get_use_vllm

 T = TypeVar("T", bound="torch.nn.Module")
@@ -527,8 +528,8 @@ class MatMulLowBit(torch.autograd.Function):
         A, weight = ctx.tensors
         grad_A, grad_weight = None, None
         if req_gradA:
-            if
-                grad_output = grad_output.to(
+            if is_autocast_enabled("xpu"):
+                grad_output = grad_output.to(get_autocast_dtype("xpu"))
             if weight.qtype == NF4:
                 dequant_weight = xe_linear.dequant(A,
                                                    weight.data.view(torch.uint8),
@@ -615,7 +616,7 @@ class LowBitLinear(nn.Linear):
         is_training = self.training and not torch.is_inference_mode_enabled()
         if is_training:
             # below logic is only for training
-            autocast_dtype = get_autocast_dtype(x)
+            autocast_dtype = get_autocast_dtype(x.device.type)
             if self.compute_dtype is not None and x.device.type == "xpu":
                 x = x.to(self.compute_dtype)  # solve GC issue for unlora module
             elif autocast_dtype is not None:
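The backward hunk above is the core behavioral change in this file: instead of the removed tensor-based call, the code now asks the new device-generic helpers whether XPU autocast is active and, if so, casts the incoming gradient to the active autocast dtype. A minimal standalone sketch of the same pattern; the Function class and the plain matmul are illustrative, not code from the package:

import torch
from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype

class AutocastAwareMatMul(torch.autograd.Function):
    @staticmethod
    def forward(ctx, A, weight):
        ctx.save_for_backward(A, weight)
        return A @ weight.t()

    @staticmethod
    def backward(ctx, grad_output):
        A, weight = ctx.saved_tensors
        if is_autocast_enabled("xpu"):
            # match the dtype autocast would produce for downstream ops
            grad_output = grad_output.to(get_autocast_dtype("xpu"))
        # d(A @ W^T)/dA = grad @ W ; d(A @ W^T)/dW = grad^T @ A
        return grad_output @ weight, grad_output.t() @ A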
ipex_llm/transformers/npu_model.py
CHANGED
@@ -139,8 +139,10 @@ class _BaseAutoModelClass:
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
-        fuse_layers = kwargs.pop(
-        imatrix_file = kwargs.pop(
+        fuse_layers = kwargs.pop("fuse_layers", None)
+        imatrix_file = kwargs.pop("imatrix_file", None)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)

         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
@@ -236,6 +238,8 @@ class _BaseAutoModelClass:
             "fuse_layers": fuse_layers,
             "imatrix_data": imatrix_data,
             "skip_npu_logic": mock_device == "dummy",
+            "keep_ir": keep_ir,
+            "compile_blob": compile_blob,
         }
         # Dummy will skip npu related logic and save the quantized model
         if mock_device == "dummy":
@@ -280,9 +284,14 @@ class _BaseAutoModelClass:
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
         skip_npu_logic = kwargs.pop("skip_npu_logic", False)
+        keep_ir = kwargs.pop("keep_ir", False)
+        compile_blob = kwargs.pop("compile_blob", True)
+
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
+        invalidInputError(keep_ir or compile_blob,
+                          "Please save blob or save IR either.")

         if hasattr(model, "llm"):
             llm = model.llm
@@ -323,7 +332,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 save_directory=save_directory,
                 fuse_layers=fuse_layers,
-                has_llm=hasattr(model, "llm")
+                has_llm=hasattr(model, "llm"),
+                keep_ir=keep_ir,
+                compile_blob=compile_blob
             )
         else:
             optimize_llm(
@@ -346,7 +357,9 @@ class _BaseAutoModelClass:
                 qtype=qtype,
                 convert_model=convert_model,
                 save_directory=save_directory,
-                fuse_layers=fuse_layers
+                fuse_layers=fuse_layers,
+                keep_ir=keep_ir,
+                compile_blob=compile_blob)
         model.save_low_bit = types.MethodType(save_low_bit, model)
         model.save_low_bit(save_directory)
         logger.info(f"Converted model has already saved to {save_directory}.")
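Taken together, these hunks thread two new keyword arguments through the NPU conversion path: keep_ir (retain the intermediate IR files) and compile_blob (compile and save the NPU blob, default True), with a guard rejecting the combination where both are False. A hedged sketch of how a caller might pass them; the model id and quantization settings below are illustrative, not prescriptive:

from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",  # illustrative model id
    load_in_low_bit="sym_int4",
    convert_model=True,
    save_directory="./npu-converted-model",
    keep_ir=True,       # new in this build: retain intermediate IR files
    compile_blob=True,  # new in this build: also compile the NPU blob (default)
)

At least one of the two flags must stay True: a run that neither keeps the IR nor compiles a blob would produce nothing to save, which is exactly what the new invalidInputError check enforces.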
ipex_llm/transformers/npu_models/convert.py
CHANGED
@@ -450,7 +450,9 @@ def optimize_llm_single_process(
     qtype: str,
     save_directory: str,
     fuse_layers: int=None,
-    has_llm: bool=False
+    has_llm: bool=False,
+    keep_ir: bool=False,
+    compile_blob: bool=True
 ):
     from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
@@ -463,7 +465,9 @@ def optimize_llm_single_process(
         qtype=qtype,
         convert_model=True,
         save_directory=save_directory,
-        fuse_layers=fuse_layers
+        fuse_layers=fuse_layers,
+        keep_ir=keep_ir,
+        compile_blob=compile_blob)
     try:
         model_ptr = load_model_from_file(save_directory)
         model.kv_len = kv_len
ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
CHANGED
@@ -196,7 +196,9 @@ def convert_llm(model: torch.nn.Module,
                 qtype: str,
                 convert_model: bool=False,
                 save_directory: str=None,
-                fuse_layers: int=None
+                fuse_layers: int=None,
+                keep_ir: bool=False,
+                compile_blob: bool=True):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -220,7 +222,9 @@ def convert_llm(model: torch.nn.Module,
                               n_splits_down_proj,
                               group_size,
                               save_directory,
-                              fuse_layers=fuse_layers
+                              fuse_layers=fuse_layers,
+                              keep_ir=keep_ir,
+                              compile_blob=compile_blob)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -428,7 +432,9 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_down_proj: int,
                            group_size: int,
                            save_directory: str=None,
-                           fuse_layers: int=None
+                           fuse_layers: int=None,
+                           keep_ir: bool=False,
+                           compile_blob: bool=True):
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
@@ -479,14 +485,17 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                  save_directory, weight_dir, transpose_value_cache, kv_len,
-                                 group_size, layernorm_const, "decode"
+                                 group_size, layernorm_const, "decode",
+                                 keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                            save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                           group_size, layernorm_const, "prefill"
+                           group_size, layernorm_const, "prefill",
+                           keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding
         convert_lm_head_and_embedding(model, save_directory, weight_dir,
-                                      convert_model=True, group_size=group_size
+                                      convert_model=True, group_size=group_size,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "llama":
         embedding_post = False
         cos_sin_input = False
@@ -540,15 +549,18 @@ def convert_llm_for_deploy(model: torch.nn.Module,
             convert_lm_head_and_embedding(model, n_splits_linear,
                                           save_directory, weight_dir,
                                           convert_model=True,
-                                          max_prompt_len=max_prompt_len
+                                          max_prompt_len=max_prompt_len,
+                                          keep_ir=keep_ir, compile_blob=compile_blob)
         # save fused_layers blobs of fused decoder layers
         convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                   save_directory, weight_dir, transpose_value_cache, kv_len,
-                                  group_size, layernorm_const, "decode"
+                                  group_size, layernorm_const, "decode",
+                                  keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_llama_layer(model, 0, n_splits_linear, n_splits_down_proj,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                            group_size, layernorm_const, "prefill"
+                            group_size, layernorm_const, "prefill",
+                            keep_ir=keep_ir, compile_blob=compile_blob)
     elif model.config.model_type == "minicpm":
         if group_size == 0:
             fused_layers = 4 if fuse_layers is None else fuse_layers
@@ -577,16 +589,19 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         # save fused_layers blobs of fused decoder layers
         convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                     save_directory, weight_dir, transpose_value_cache, kv_len,
-                                    group_size, layernorm_const, "decode"
+                                    group_size, layernorm_const, "decode",
+                                    keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of single prefill layer
         convert_minicpm_layer(model, 0, n_splits_linear, n_splits_down_proj,
                               save_directory, weight_dir, transpose_value_cache, max_prompt_len,
-                              group_size, layernorm_const, "prefill"
+                              group_size, layernorm_const, "prefill",
+                              keep_ir=keep_ir, compile_blob=compile_blob)
         # save blob of lmhead and bin of embedding and embedding_post
         convert_lm_head_and_embedding(model, n_splits_linear,
                                       save_directory, weight_dir,
                                       convert_model=True,
-                                      max_prompt_len=max_prompt_len
+                                      max_prompt_len=max_prompt_len,
+                                      keep_ir=keep_ir, compile_blob=compile_blob)

     model.config.update(update_dict)
     model.config.save_pretrained(save_directory)
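Every per-model exporter called above now receives keep_ir and compile_blob and, as the llama/minicpm/qwen hunks below show, deletes the intermediate .bin weight file once the blob has been exported. The recurring shape of that change, as a sketch; export_blob is passed in as a stand-in for update_names_of_IR_and_export_blob, whose full signature lives outside this diff:

import os

def export_and_clean(export_blob, factory, name, out_dir,
                     keep_ir=False, compile_blob=True):
    # forward both flags to the blob exporter ...
    blob_path = export_blob(factory, name, out_dir,
                            keep_ir=keep_ir, compile_blob=compile_blob)
    # ... then drop the intermediate weight file to reclaim disk space
    os.remove(os.path.join(out_dir, name + ".bin"))
    return blob_path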
ipex_llm/transformers/npu_pipeline_model/llama.py
CHANGED
@@ -123,7 +123,8 @@ class Llama32PostEmbedding(NNFactory):


 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +176,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))

     # save weights bins files
     if n_splits_linear == 1:
@@ -211,7 +213,9 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
             first_blob_path = None
         else:
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir,
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
     else:
         # llama-3.2-3B & llama-3.2-1B
         embedding_layer = model.model.embed_tokens
@@ -235,22 +239,28 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                              attention_scaling=attention_scaling,
                                              input_len=1)
             update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir,
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
             embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
                                                           attention_scaling=attention_scaling,
                                                           input_len=max_prompt_len)
             update_names_of_IR_and_export_blob(embedding_post_prefill,
                                                "embedding_post_prefill",
-                                               temp_dir,
+                                               temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+            os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
         else:
             first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir
+                                                                 temp_dir, keep_ir=keep_ir,
+                                                                 compile_blob=compile_blob)
+            os.remove(os.path.join(temp_dir, "embedding.bin"))
+
     return first_blob_path, last_blob_path


 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +327,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))

     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +375,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +469,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-
-
+                                           keep_ir=keep_ir,
+                                           compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
ipex_llm/transformers/npu_pipeline_model/minicpm.py
CHANGED
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):


 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))

     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                              dtype=np.float16,
                                              scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir,
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
         embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                       dtype=np.float16,
                                                       scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post_prefill,
                                            "embedding_post_prefill",
-                                           temp_dir,
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir,
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path


 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))

     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=
-
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
ipex_llm/transformers/npu_pipeline_model/qwen.py
CHANGED
@@ -24,7 +24,8 @@ from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead


 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0
+                                  convert_model=False, group_size=0,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +85,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )

     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))

     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -119,13 +122,16 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
         first_blob_path = True
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir,
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path


 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -183,8 +189,10 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))

     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
@@ -226,7 +234,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,

 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -330,6 +339,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
         update_names_of_IR_and_export_blob(fused_decoder,
                                            f"decoder_layer_{i}",
                                            save_dir,
-                                           compile_blob=
-
+                                           keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
     return 0
ipex_llm/transformers/qlora.py
CHANGED
@@ -109,7 +109,7 @@ class LoraLowBitLinear(Module, LoraLayer):
         self.qa_pool = torch.nn.Identity()

     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
            # force to use bf16 on gpu
            x = x.to(torch.bfloat16)
@@ -177,7 +177,7 @@ class LoraBF16Linear(Module, LoraLayer):
         self.is_target_conv_1d_layer = is_target_conv_1d_layer

     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
            # force to use bf16 on gpu
            x = x.to(torch.bfloat16)
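The only change here is the call-site migration forced by the reworked helper: get_autocast_dtype now takes a device-type string rather than a tensor, so both LoRA forward paths pass x.device.type. In sketch form:

autocast_dtype = get_autocast_dtype(x)              # old: the helper inspected the tensor
autocast_dtype = get_autocast_dtype(x.device.type)  # new: pass "xpu" or "cpu" explicitly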
ipex_llm/transformers/utils.py
CHANGED
@@ -138,26 +138,39 @@ def fix_key(key):
     return key


-def
+def is_autocast_enabled(device_type: str):
     if torch.__version__ >= '2.3':
-
-
+        return torch.is_autocast_enabled(device_type)
+    else:
+        if device_type == "xpu":
+            return torch.xpu.is_autocast_xpu_enabled()
+        elif device_type == "cpu":
+            return torch.is_autocast_cpu_enabled()
+        else:
+            invalidInputError(False,
+                              f"Device type {device_type} is not supported.")
+
+
+def get_autocast_dtype(device_type: str):
+    if torch.__version__ >= '2.3':
+        if torch.is_autocast_enabled(device_type):
+            return torch.get_autocast_dtype(device_type)
         else:
             return None
     else:
-        if
+        if device_type == "xpu":
             if torch.xpu.is_autocast_xpu_enabled():
                 return torch.xpu.get_autocast_xpu_dtype()
             else:
                 return None
-        elif
+        elif device_type == "cpu":
             if torch.is_autocast_cpu_enabled():
                 return torch.get_autocast_cpu_dtype()
             else:
                 return None
         else:
             invalidInputError(False,
-                              f"Device {
+                              f"Device type {device_type} is not supported.")


 def get_xpu_device_name(device: torch.device):
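The rewritten helpers give one entry point for "is autocast on, and at what dtype" that works across PyTorch versions: on torch >= 2.3 they delegate to the device-generic torch.is_autocast_enabled(device_type) / torch.get_autocast_dtype(device_type), and on older builds they fall back to the per-device torch.xpu.is_autocast_xpu_enabled() / torch.is_autocast_cpu_enabled() variants. A quick CPU-side check of the intended semantics, assuming a torch build with CPU autocast available:

import torch
from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    assert is_autocast_enabled("cpu")
    assert get_autocast_dtype("cpu") == torch.bfloat16

assert get_autocast_dtype("cpu") is None  # returns None once autocast is inactive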
ipex_llm/transformers/xpu_customize_fwd.py
CHANGED
@@ -107,6 +107,8 @@ except ModuleNotFoundError:
     np = None  # type: ignore[assignment]
 from typing import Any

+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+

 def _cast(value, dtype):
     if isinstance(value, torch.Tensor):
@@ -155,12 +157,12 @@ def custom_fwd(fwd=None, *, cast_inputs=None):

     @functools.wraps(fwd)
     def decorate_fwd(*args, **kwargs):
-        args[0]._dtype =
+        args[0]._dtype = get_autocast_dtype("xpu")
         if cast_inputs is None:
-            args[0]._fwd_used_autocast =
+            args[0]._fwd_used_autocast = is_autocast_enabled("xpu")
             return fwd(*args, **kwargs)
         else:
-            autocast_context =
+            autocast_context = is_autocast_enabled("xpu")
             args[0]._fwd_used_autocast = False
             if autocast_context:
                 with torch.xpu.autocast(enabled=False):
@@ -184,7 +186,7 @@ def custom_bwd(bwd):

     @functools.wraps(bwd)
     def decorate_bwd(*args, **kwargs):
-        with torch.
+        with torch.autocast("xpu", enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
             return bwd(*args, **kwargs)

     return decorate_bwd
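With this change, custom_fwd and custom_bwd record XPU autocast state through the shared utils helpers, and custom_bwd re-enters autocast via the device-generic torch.autocast("xpu", ...) with the dtype captured at forward time. These decorators mirror the torch.cuda.amp custom_fwd/custom_bwd API; a sketch of decorating an autograd Function with them (the Function itself is illustrative, not from the package):

import torch
from ipex_llm.transformers.xpu_customize_fwd import custom_fwd, custom_bwd

class FP32MatMul(torch.autograd.Function):
    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)  # run forward in fp32 even under autocast
    def forward(ctx, a, b):
        ctx.save_for_backward(a, b)
        return a @ b

    @staticmethod
    @custom_bwd  # backward re-enters XPU autocast with the recorded dtype
    def backward(ctx, grad_output):
        a, b = ctx.saved_tensors
        # d(a @ b)/da = grad @ b^T ; d(a @ b)/db = a^T @ grad
        return grad_output @ b.t(), a.t() @ grad_output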
{ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250205
+Version: 2.2.0b20250206
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'all'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Provides-Extra: cpp-arl
-Requires-Dist: bigdl-core-cpp ==2.6.
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250206 ; extra == 'cpp-arl'
 Requires-Dist: setuptools ; extra == 'cpp-arl'
 Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
 Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.
+Requires-Dist: bigdl-core-npu ==2.6.0b20250206 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250206 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250206 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -140,9 +140,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -163,9 +163,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -186,9 +186,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250206 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
{ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/RECORD
CHANGED
@@ -41,35 +41,35 @@ ipex_llm/langchain/llms/transformerspipelinellm.py,sha256=vm522YPPwWxxAPVvQBtxRf
 ipex_llm/langchain/vllm/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/langchain/vllm/vllm.py,sha256=6dxc-ZISZQrJilEa_HA827l75Dv9rcHpY_G6FdJ8BVs,7793
 ipex_llm/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ipex_llm/libs/bloom-api.dll,sha256=
-ipex_llm/libs/bloom.dll,sha256=
-ipex_llm/libs/gptneox-api.dll,sha256=
-ipex_llm/libs/gptneox.dll,sha256=
-ipex_llm/libs/libbloom_avx.dll,sha256=
-ipex_llm/libs/libbloom_vnni.dll,sha256=
-ipex_llm/libs/libgptneox_avx.dll,sha256=
-ipex_llm/libs/libgptneox_vnni.dll,sha256=
-ipex_llm/libs/libllama_avx.dll,sha256=
-ipex_llm/libs/libllama_vnni.dll,sha256=
-ipex_llm/libs/libstarcoder_avx.dll,sha256=
-ipex_llm/libs/libstarcoder_vnni.dll,sha256=
-ipex_llm/libs/llama-api.dll,sha256=
-ipex_llm/libs/llama.dll,sha256=
-ipex_llm/libs/main-bloom.exe,sha256=
-ipex_llm/libs/main-gptneox.exe,sha256=
-ipex_llm/libs/main-llama.exe,sha256=
-ipex_llm/libs/main-starcoder.exe,sha256=
-ipex_llm/libs/pipeline.dll,sha256=
-ipex_llm/libs/quantize-bloom.exe,sha256=
-ipex_llm/libs/quantize-bloom_vnni.exe,sha256=
-ipex_llm/libs/quantize-gptneox.exe,sha256=
-ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=
-ipex_llm/libs/quantize-llama.exe,sha256=
-ipex_llm/libs/quantize-llama_vnni.exe,sha256=
-ipex_llm/libs/quantize-starcoder.exe,sha256=
-ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=
-ipex_llm/libs/starcoder-api.dll,sha256=
-ipex_llm/libs/starcoder.dll,sha256=
+ipex_llm/libs/bloom-api.dll,sha256=H0S3QMH9mK_VlsEGqqM7vGKNiuvD1j3_cNOloDEqojg,36352
+ipex_llm/libs/bloom.dll,sha256=GtBKdhbPz4gZDtzdcrjiIa0IoZighOQmWoaScXlCzGA,507904
+ipex_llm/libs/gptneox-api.dll,sha256=_8nji5kq5Z524SGHaElsEFZCkCZJRyjLAbK7dF5EAkE,24576
+ipex_llm/libs/gptneox.dll,sha256=su29UwirxxACBTb9rKx4ln5sKsmmG82J7wbIFv9GOQs,568320
+ipex_llm/libs/libbloom_avx.dll,sha256=v1lgo7B-JJMWEwgs5hDwkm9XSd0nmO1r1X8JoYaJLIs,536576
+ipex_llm/libs/libbloom_vnni.dll,sha256=dBbTV7wWKZKPMw4oZL-H2_ooLdBhwziXLI97xLKvC3w,508416
+ipex_llm/libs/libgptneox_avx.dll,sha256=F_JBremk85c6zqKo0_rse9YXo9v_T52jFwy8Nnwt9yg,596992
+ipex_llm/libs/libgptneox_vnni.dll,sha256=8A6hc2w5Xqq2MoY_t-El6upUqFuI5Cu-ITiiDv9Nfvg,568832
+ipex_llm/libs/libllama_avx.dll,sha256=fADeqa8IK5akM04Cjyd1IRY3Exk8tAuIdNzKBew2zJg,591360
+ipex_llm/libs/libllama_vnni.dll,sha256=SbwkJLCQqtIW9zz_QKzAYb5kqfyUSs8-gddMikbB57s,563200
+ipex_llm/libs/libstarcoder_avx.dll,sha256=vgvvBkIZ18ofJ9rE69gkNn9SpY025RyI7x2VM0APDWA,627712
+ipex_llm/libs/libstarcoder_vnni.dll,sha256=L0cdtY2qHvKpJhFEPl_UkaCVhUw4tcknoIuWbyxQ-ck,599552
+ipex_llm/libs/llama-api.dll,sha256=7yQHdnnFcNiHESH3nrGLyEWscKV9FTPWmDqk-Gf9bA8,25600
+ipex_llm/libs/llama.dll,sha256=pOUGsXP8_NP1byv7z_Q-JU2flWnTjYlCL6lbU-RvORw,562688
+ipex_llm/libs/main-bloom.exe,sha256=bK5DfBLbt4jHwdPl0hw1zaBGQHFWC9MFjiDRqCXFgFA,103424
+ipex_llm/libs/main-gptneox.exe,sha256=3OfGBYDzOpYeB6GxToauh8af4M8i6l4Z6ffYQPdKyIw,98816
+ipex_llm/libs/main-llama.exe,sha256=wZGa8lG3bfaEQi8-DvRC4D3sjMKXms1pwT9OXVME4_Y,99840
+ipex_llm/libs/main-starcoder.exe,sha256=3yZrYUpJ1FYOWCh6PNmWagQ5e6BmimlL25B6AiPmQys,157696
+ipex_llm/libs/pipeline.dll,sha256=uDPNVk7J_dvOX_NTAJs6AEtm5pAnwYLuczHYuTV6Pso,72704
+ipex_llm/libs/quantize-bloom.exe,sha256=6Rl2TEE9-FN0jHrcAYsZjfp0kAxzMoHKuvM31d8pzPs,126464
+ipex_llm/libs/quantize-bloom_vnni.exe,sha256=7Q20DE84l-CDxcVgUxzWspAh0faioQw2iJqdtk9JME4,128000
+ipex_llm/libs/quantize-gptneox.exe,sha256=QRxEqJYH3ShD6KLhW3guxM_SxPusFADvv8j5euhp53Q,104448
+ipex_llm/libs/quantize-gptneox_vnni.exe,sha256=fj2E8ChFakQQzHHpYb_UxNy-9yQ9ZbChhr5PYUCdWkw,104960
+ipex_llm/libs/quantize-llama.exe,sha256=v2Rq663-92bMV3ze9l2-ocxvSjTeqlJegfY5XLf4MRQ,110080
+ipex_llm/libs/quantize-llama_vnni.exe,sha256=ywCgkuUA_jBImNslFpLFdcUvGv2pcbRvRZyZBhJ6-4c,110592
+ipex_llm/libs/quantize-starcoder.exe,sha256=zbiQpagpWGSYqgyHEmHgglDen3nDUS1LyhUXJbt65wE,127488
+ipex_llm/libs/quantize-starcoder_vnni.exe,sha256=xtynT6qbnZ1nBRxsIQbi2JGSKOlvdSCCozDQJiDSwCg,128512
+ipex_llm/libs/starcoder-api.dll,sha256=7c7MWBv57ZhfiynSrYJIKhnE6HMXUTSYYTUGwD7BX9I,21504
+ipex_llm/libs/starcoder.dll,sha256=AzASEAh2HCDC9XIQ0JfUiUDqF-3p4KR3rF71MKQDA4k,599040
 ipex_llm/llamaindex/__init__.py,sha256=T-EbRT6GJ_8RCu-iLmSzcftOimXSPQf2d5X72AUAy2Y,874
 ipex_llm/llamaindex/llms/__init__.py,sha256=KP1lEdGqDuxPoxL1ZSH25Pm2kKMPJBWUTLR0ckSLMIU,1139
 ipex_llm/llamaindex/llms/bigdlllm.py,sha256=FQBzq1KOjfc6uofTXAha3O7TqpJkNfOFepXQmOVlbnI,26314
@@ -94,19 +94,19 @@ ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,1
 ipex_llm/transformers/lisa.py,sha256=F5WxbtXQ7RdKulj83h_2DnEIgKiKGZf7zvOmg6QBl2s,3289
 ipex_llm/transformers/loader.py,sha256=AwjV5RpI2t2bedlv7ZhLm8cfd-QJZm5hny-XyjIvdnk,6876
 ipex_llm/transformers/lookup.py,sha256=b6OlZ9OV10R9qeWw8mVryVpDxszkjwLkldvi7GPMJY8,19614
-ipex_llm/transformers/low_bit_linear.py,sha256=
+ipex_llm/transformers/low_bit_linear.py,sha256=3EtbiCAq5HU_r2pGJ9beSDK4NPTN8Jj-aHMqm1jqX18,39177
 ipex_llm/transformers/model.py,sha256=cQJNlAkdfoWmVbWd-TS2hf-Do41mMO9orPvG3FO4Nns,40855
 ipex_llm/transformers/modelling_bigdl.py,sha256=7JpNVMuyq_OmtNUaMFMXdxPWZp2q0QHC02QeA-VTPOw,6709
-ipex_llm/transformers/npu_model.py,sha256=
+ipex_llm/transformers/npu_model.py,sha256=LMmRmhq8IAN9FrXLUeUK2B8XS2OJ5GVWmG0cEdeK-ro,40354
 ipex_llm/transformers/patches.py,sha256=G9KcXxo42H1HJEDaroq4JbBN5P0P0lty7U7kk7-g4tw,991
 ipex_llm/transformers/pipeline_parallel.py,sha256=uNZpOXljNmdoEYnP8U-VFiN4dRZb2piQbIf2bG9LQnE,49051
-ipex_llm/transformers/qlora.py,sha256=
+ipex_llm/transformers/qlora.py,sha256=qV9Y6G5kAaet77LLA3oXn3qQY4ayyAPZ7NAjOlHCS7g,14967
 ipex_llm/transformers/relora.py,sha256=-dYzUV0P-IhO2jFdnzN9-v_sFzJpRj3ZwN9eCJzOoCw,16567
 ipex_llm/transformers/speculative.py,sha256=0XNLgc9dGswJHVPrXo4iM7pPxkWwfFfJMECcivJSnIc,63368
 ipex_llm/transformers/streamer.py,sha256=RrVlLblzCOtABRUpaMXAyaMnCGgLUtAi_YesLumRbww,4842
 ipex_llm/transformers/training_patch.py,sha256=oxMkUtqyvqJiprw6dE3skkYfD1HOmUlH9N0hBkbn0G0,10799
-ipex_llm/transformers/utils.py,sha256=
-ipex_llm/transformers/xpu_customize_fwd.py,sha256=
+ipex_llm/transformers/utils.py,sha256=a-2wbflSd_yYnC5qcMoY5HLR1yT_QpxeX_WpGpaDLrA,17457
+ipex_llm/transformers/xpu_customize_fwd.py,sha256=PUBYLnTbaBXUs3Dnte9Gqln2XFk8iA62SmloWjr7GJI,7668
 ipex_llm/transformers/xpu_ops.py,sha256=z95iTtcDQvNyJOvB4A6B_ECTYjHp4A7x-FsssoETOMs,4914
 ipex_llm/transformers/awq/__init__.py,sha256=Du5gu3-eeAkeDO_dEMBTzrDBA66DSN3uL3-rn8WGXQw,875
 ipex_llm/transformers/awq/act.py,sha256=YwomJzOOKwkKtzGrm4L4kwBstBLO1Z8SK4CKi8PSYVQ,2172
@@ -183,7 +183,7 @@ ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLr
 ipex_llm/transformers/npu_models/chatglm.py,sha256=YzpGLZ7ORt6qkwW9mCwZ_xhOAI8uHSDHJrmqWgNM234,10511
 ipex_llm/transformers/npu_models/chatglm4.py,sha256=J4523DzhIzZxIvlf1V9qU4auzEGKvC80YqyxuCJygjw,9795
 ipex_llm/transformers/npu_models/common.py,sha256=tTUJL7IxVrJSnXle6nla35wTUrBf2sOEt7Ya1qyMezY,4853
-ipex_llm/transformers/npu_models/convert.py,sha256=
+ipex_llm/transformers/npu_models/convert.py,sha256=2YAi8rvEYu_tvzpczKsJBsKjAns5FAPz1MntJTxIQC0,25472
 ipex_llm/transformers/npu_models/convert_mp.py,sha256=Y6Fcde7bXHkZ0wvm8PymxJqvncbDj3ZjMez3SY9qi5U,24452
 ipex_llm/transformers/npu_models/glm_edge.py,sha256=VsJex-6530h4ZQk35TxRe1MnttAHT41omE8LV47LgBE,6723
 ipex_llm/transformers/npu_models/kv.py,sha256=2OSFO9Z6e4nGdVxXEM-Bq2qa_npYYbGmQt3lcCZxTlU,9201
@@ -208,11 +208,11 @@ ipex_llm/transformers/npu_models/xlm_mp.py,sha256=sj8OVun8xJprM7ZJp0XzWa55rqlSIz
 ipex_llm/transformers/npu_pipeline_model/__init__.py,sha256=b2IXvVqQ5cItki021h8s3ymW12RPu8QNPprq4Mn3bDM,586
 ipex_llm/transformers/npu_pipeline_model/baichuan.py,sha256=ICxRzFQ4OIANDkkVi2_4xOeQXmfFXYMx3H52KuE1xR4,6208
 ipex_llm/transformers/npu_pipeline_model/common.py,sha256=QxJoJESpv0BpwO_FBeAT2wKA56wNFfen8iI37PrMKuA,7838
-ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256
-ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=
-ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py,sha256=-eHNbRuX2QhYd0-jCyo2pZpHTZTZ108bhObYx8a3CJs,29494
+ipex_llm/transformers/npu_pipeline_model/llama.py,sha256=pmAnawfAn0W8XSr8kGWfxR1HylCLa-Y6mKpFeX-m8UY,20892
+ipex_llm/transformers/npu_pipeline_model/minicpm.py,sha256=H7j_UaHj-IwEBriQ-bunle0-8s2NmvqnL9eYuixnmFc,21398
 ipex_llm/transformers/npu_pipeline_model/pipeline_cpp.py,sha256=JNmodAMg_NQvDILug3E_fGXEh6cd3wsj4bvAzcd-vaU,2749
-ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=
+ipex_llm/transformers/npu_pipeline_model/qwen.py,sha256=FAfoPlKEAxeU6-J8ltpSev5ithm9AC-urtreu6NGpME,15509
 ipex_llm/utils/__init__.py,sha256=LlUgrD03rfw4iY8zWPtHH6p65Gw76waVOLHaqagETw0,1425
 ipex_llm/utils/benchmark_util_4_29.py,sha256=OU1W1quiaiJGsg1pd3HM9O6PmVSaPA0HHE7R8hNTfmQ,258653
 ipex_llm/utils/benchmark_util_4_42.py,sha256=HEiClCgKDp_T64HH8ulSTly8dvt6UwPDYZfrPVYvXcc,225383
@@ -248,11 +248,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
+ipex_llm-2.2.0b20250206.data/scripts/ipex-llm-init.bat,sha256=HPtCYuDYwEatq7dAwOvdfVcHYCpAVdbj75K1qh0vQek,2578
+ipex_llm-2.2.0b20250206.data/scripts/llm-chat.ps1,sha256=6qrs-hGVAV8IKh7Jx8nq_XrnZcjd7qGU5wndArM7Yag,2769
+ipex_llm-2.2.0b20250206.data/scripts/llm-cli.ps1,sha256=3qBtTLs_EjYDnM8YyCpJhzLnGCKTEGssu9UNqfkjVXs,3009
+ipex_llm-2.2.0b20250206.dist-info/METADATA,sha256=pAr_-dBEJB_J2lV8oNgJkJ5bGTObiseNHISkXAGkY9I,12879
+ipex_llm-2.2.0b20250206.dist-info/WHEEL,sha256=6iYPr8vTHsyDK75jr9X0V3I9wPSVmtwr_8fdATBciGk,98
+ipex_llm-2.2.0b20250206.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250206.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250206.dist-info/RECORD,,
{ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250206.data}/scripts/ipex-llm-init.bat
File without changes
{ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250206.data}/scripts/llm-chat.ps1
File without changes
{ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250206.data}/scripts/llm-cli.ps1
File without changes
{ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/WHEEL
File without changes
{ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/entry_points.txt
File without changes
{ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250206.dist-info}/top_level.txt
File without changes