ipex-llm 2.2.0b20250108__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250109__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ipex_llm/transformers/convert.py +5 -13
  2. ipex_llm/transformers/models/baichuan.py +7 -36
  3. ipex_llm/transformers/models/bert.py +2 -13
  4. ipex_llm/transformers/models/chatglm2.py +8 -31
  5. ipex_llm/transformers/models/chatglm4.py +9 -4
  6. ipex_llm/transformers/models/chatglm4v.py +1 -1
  7. ipex_llm/transformers/models/common.py +3 -1
  8. ipex_llm/transformers/models/glm.py +1 -1
  9. ipex_llm/transformers/models/internlm.py +6 -3
  10. ipex_llm/transformers/models/llama.py +1 -1
  11. ipex_llm/transformers/models/minicpm.py +1 -1
  12. ipex_llm/transformers/models/minicpm3.py +3 -1
  13. ipex_llm/transformers/models/mistral.py +1 -1
  14. ipex_llm/transformers/models/mllama.py +1 -1
  15. ipex_llm/transformers/models/phi3.py +6 -2
  16. ipex_llm/transformers/models/qwen.py +4 -2
  17. ipex_llm/transformers/models/qwen2.py +4 -3
  18. ipex_llm/transformers/models/qwen2_moe.py +4 -2
  19. ipex_llm/transformers/models/qwen2_vl.py +3 -1
  20. ipex_llm/transformers/models/stablelm.py +3 -1
  21. ipex_llm/transformers/models/starcoder2.py +3 -1
  22. ipex_llm/transformers/models/utils.py +7 -3
  23. ipex_llm/transformers/models/yuan.py +2 -1
  24. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA +20 -20
  25. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD +31 -31
  26. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/ipex-llm-init +0 -0
  27. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-chat +0 -0
  28. {ipex_llm-2.2.0b20250108.data → ipex_llm-2.2.0b20250109.data}/scripts/llm-cli +0 -0
  29. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/WHEEL +0 -0
  30. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/entry_points.txt +0 -0
  31. {ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/convert.py
@@ -1325,7 +1325,6 @@ def _optimize_post(model):
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
  from ipex_llm.transformers.models.chatglm2 import chatglm2_attention_forward
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
  from ipex_llm.transformers.models.chatglm2 import chatglm2_encoder_forward
  from ipex_llm.transformers.models.chatglm2 import chatglm2_model_forward
  from ipex_llm.transformers.models.chatglm2 import mlp_forward
@@ -1338,9 +1337,7 @@ def _optimize_post(model):
  convert_forward(model,
  module.ChatGLMModel,
  chatglm2_model_forward)
- convert_forward(model,
- module.RMSNorm,
- chatglm_rms_norm_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)
  convert_forward(model, module.MLP, mlp_forward)
  # for codegeex-nano
  if hasattr(model.config, "rope_ratio"):
@@ -1358,8 +1355,7 @@ def _optimize_post(model):
  # glm4 family
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
- convert_forward(model, module.RMSNorm, chatglm_rms_norm_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)

  if hasattr(model.transformer, "vision"):
  # glm4 vision family
@@ -1448,8 +1444,8 @@ def _optimize_post(model):
  elif model.config.model_type == "baichuan":
  modeling_module_name = model.__class__.__module__
  module = importlib.import_module(modeling_module_name)
- from ipex_llm.transformers.models.baichuan import baichuan_mlp_forward
- convert_forward(model, module.MLP, baichuan_mlp_forward)
+ convert_forward(model, module.RMSNorm, rms_norm_forward)
+ convert_forward(model, module.MLP, mlp_silu_forward)

  if model.config.hidden_size in [4096, 2048]:
  # baichuan-7B and baichuan2-7B
@@ -1458,7 +1454,6 @@ def _optimize_post(model):
  for i in range(len(model.model.layers)):
  setattr(model.model.layers[i].self_attn, "layer_idx", i)
  convert_forward(model, module.Attention, baichuan_attention_forward_7b)
- convert_forward(model, module.RMSNorm, rms_norm_forward)
  if model.config.vocab_size == 125696:
  # baichuan2-7B
  convert_forward(model, module.BaichuanModel, baichuan_model_7b_forward)
@@ -1468,9 +1463,7 @@ def _optimize_post(model):
  elif model.config.hidden_size == 5120:
  # baichuan-13B and baichuan2-13B
  from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_13b
- from ipex_llm.transformers.models.baichuan import baichuan_13b_rms_norm_forward
  convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b)
- convert_forward(model, module.RMSNorm, baichuan_13b_rms_norm_forward)

  if model.config.vocab_size == 125696:
  # baichaun2-13B
@@ -1565,7 +1558,6 @@ def _optimize_post(model):
  from ipex_llm.transformers.models.qwen import qwen_attention_forward
  from ipex_llm.transformers.models.qwen import qwen_attention_forward_registered
  from ipex_llm.transformers.models.qwen import qwen_mlp_forward
- from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward
  from ipex_llm.transformers.models.qwen import qwen_model_forward
  if model.config.max_position_embeddings == 8192 \
  and model.config.hidden_size == 4096:
@@ -1580,7 +1572,7 @@ def _optimize_post(model):
  )
  convert_forward(model,
  module.RMSNorm,
- chatglm_rms_norm_forward)
+ rms_norm_forward)
  convert_forward(model,
  module.QWenMLP,
  qwen_mlp_forward)
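
Note on the convert.py hunks above: the per-model RMSNorm forwards (chatglm_rms_norm_forward, baichuan_13b_rms_norm_forward) are dropped in favor of the shared rms_norm_forward from models/common.py, and baichuan's MLP now uses the generic mlp_silu_forward. A minimal sketch of what a convert_forward-style helper does, assuming it simply rebinds forward on every matching submodule (the actual ipex_llm.transformers.convert implementation may differ in details):

    import types
    import torch

    def convert_forward_sketch(model: torch.nn.Module, target_cls, new_forward):
        # Walk the module tree and rebind `forward` on every instance of target_cls.
        for module in model.modules():
            if isinstance(module, target_cls):
                module.forward = types.MethodType(new_forward, module)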

ipex_llm/transformers/models/baichuan.py
@@ -47,38 +47,6 @@ def pre_compute_inv_freq(module: torch.nn.Module):
  module.register_buffer("inv_freq", inv_freq, persistent=False)


- def baichuan_13b_rms_norm_forward(self, hidden_states):
- if hidden_states.device.type == "xpu" and not (self.training or hidden_states.requires_grad):
- import xe_addons
- x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
- output = xe_addons.rms_norm(self.weight, x_2d, self.epsilon)
- return output.reshape(hidden_states.shape)
-
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon)
- return self.weight * hidden_states.to(input_dtype)
-
-
- def baichuan_mlp_forward(
- self,
- x: torch.Tensor,
- ) -> torch.Tensor:
- x_2d = x.view(-1, x.shape[-1])
- qtype = getattr(self.gate_proj, "qtype", None)
- if mlp_fusion_check(x_2d, qtype, self.training):
- import xe_linear
- if not x_2d.is_contiguous():
- x_2d = x_2d.contiguous()
- return self.down_proj(xe_linear.mlp_forward_xpu(
- x_2d, self.gate_proj.weight.data, self.up_proj.weight.data,
- x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_len,
- SILU, qtype
- ))
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
  def baichuan_model_7b_forward(
  self,
  input_ids: torch.LongTensor = None,
@@ -105,7 +73,9 @@ def baichuan_model_7b_forward(
  if use_cache:
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs)
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
+ self.config.num_attention_heads,
+ self.config.num_attention_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
@@ -278,8 +248,6 @@ def baichuan_attention_forward_7b(
  key_states = key_states.to(hidden_states.dtype)

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
-
  # [CompressKV]
  if use_compresskv:
  enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value,
@@ -290,6 +258,8 @@ def baichuan_attention_forward_7b(
  query_states, attention_mask, 1,
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH)
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
@@ -340,7 +310,8 @@ def baichuan_attention_forward_13b(
  kv_seq_len += past_key_value[0].shape[2]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
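
The removed baichuan_mlp_forward above is the standard SiLU-gated MLP with an optional fused xe_linear path for XPU; convert.py now registers the shared mlp_silu_forward instead. A sketch of the un-fused math that any replacement has to reproduce (assuming the usual gate/up/down projections; the fused path is omitted):

    import torch
    import torch.nn.functional as F

    def silu_mlp_reference(self, x: torch.Tensor) -> torch.Tensor:
        # down(silu(gate(x)) * up(x)), matching the fallback branch of the removed code
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))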

ipex_llm/transformers/models/bert.py
@@ -36,24 +36,13 @@ import math
  import torch
  from typing import Optional, Tuple
  from transformers.models.bert.modeling_bert import BertSelfAttention, BertEncoder
+ from ipex_llm.transformers.models.common import merge_linear
  from ipex_llm.utils.common import invalidInputError


  def merge_qkv(module: torch.nn.Module):
  if isinstance(module, BertSelfAttention):
- q_w = module.query.weight.data
- k_w = module.key.weight.data
- v_w = module.value.weight.data
- q_b = module.query.bias.data
- k_b = module.key.bias.data
- v_b = module.value.bias.data
- new_w = torch.cat([q_w, k_w, v_w], dim=0)
- new_b = torch.cat([q_b, k_b, v_b], dim=-1)
- qkv = torch.nn.Linear(0, 0, bias=True)
- qkv.weight = torch.nn.Parameter(new_w, requires_grad=False)
- qkv.bias = torch.nn.Parameter(new_b, requires_grad=False)
- qkv.in_features = module.query.in_features
- qkv.out_features = module.query.out_features * 3
+ qkv = merge_linear([module.query, module.key, module.value])
  module.qkv = qkv
  del module.query
  del module.key
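
merge_qkv now delegates to merge_linear from models/common.py. Its implementation is not shown in this diff, but judging from the removed hand-written version it presumably concatenates the query/key/value projections into one Linear; a sketch under that assumption (bias-carrying layers only):

    import torch

    def merge_linear_sketch(linears):
        # Stack N projections of shape (out, in) into a single (N*out, in) Linear.
        weight = torch.cat([l.weight.data for l in linears], dim=0)
        bias = torch.cat([l.bias.data for l in linears], dim=-1)
        merged = torch.nn.Linear(linears[0].in_features, weight.size(0), bias=True)
        merged.weight = torch.nn.Parameter(weight, requires_grad=False)
        merged.bias = torch.nn.Parameter(bias, requires_grad=False)
        return merged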

ipex_llm/transformers/models/chatglm2.py
@@ -33,34 +33,6 @@ from ipex_llm.transformers.kv import DynamicCompressCache, DynamicCompressFp8Cac
  KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256))


- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
- """
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states
- go from (batch, num_key_value_heads, seqlen, head_dim) to
- (batch, num_attention_heads, seqlen, head_dim)
- """
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
- if n_rep == 1:
- return hidden_states
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads,
- n_rep, slen, head_dim)
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
- def chatglm_rms_norm_forward(self, hidden_states):
- if hidden_states.device.type == "xpu" and not (self.training and hidden_states.requires_grad):
- import xe_addons
- x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous()
- output = xe_addons.rms_norm(self.weight, x_2d, self.eps)
- return output.reshape(hidden_states.shape)
-
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
- return self.weight * hidden_states.to(input_dtype)
-
-
  def chatglm2_model_forward(
  self,
  input_ids,
@@ -91,8 +63,13 @@ def chatglm2_model_forward(

  if use_cache:
  use_compress_kv = should_use_compresskv(input_ids, input_ids.shape[1])
+ n_heads = self.config.num_attention_heads
+ if self.config.multi_query_attention:
+ n_kv_heads = self.config.multi_query_group_num
+ else:
+ n_kv_heads = n_heads
  use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
- input_ids)
+ input_ids, n_heads, n_kv_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
@@ -285,8 +262,6 @@ def chatglm2_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
  # [CompressKV]
  if use_compresskv:
  from transformers.configuration_utils import PretrainedConfig
@@ -300,6 +275,8 @@ def chatglm2_attention_forward(
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
  )
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+ n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
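
For ChatGLM-family models the KV-head count is not stored as num_key_value_heads, so the model forward derives it from multi_query_attention / multi_query_group_num before calling the new use_quantize_kv_cache signature (the chatglm4 hunks below do the same). Illustration with hypothetical config values; a 2-group MQA config like this falls below the num_kv_heads >= 4 gate introduced in utils.py further down:

    # hypothetical ChatGLM-style config values, for illustration only
    num_attention_heads = 32
    multi_query_attention = True
    multi_query_group_num = 2

    n_heads = num_attention_heads
    n_kv_heads = multi_query_group_num if multi_query_attention else n_heads
    print(n_heads, n_kv_heads)  # 32 2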

ipex_llm/transformers/models/chatglm4.py
@@ -55,8 +55,13 @@ def chatglm4_model_forward(
  if use_cache:
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1])
- use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj,
- inputs)
+ n_heads = self.config.num_attention_heads
+ if self.config.multi_query_attention:
+ n_kv_heads = self.config.multi_query_group_num
+ else:
+ n_kv_heads = n_heads
+ use_quantize_kv = use_quantize_kv_cache(self.encoder.layers[0].mlp.gate_proj, inputs,
+ n_heads, n_kv_heads)
  if use_compress_kv and not isinstance(past_key_values,
  DynamicCompressCache):
  if use_quantize_kv:
@@ -211,8 +216,6 @@ def chatglm4_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
-
  # [CompressKV]
  if use_compresskv:
  from transformers.configuration_utils import PretrainedConfig
@@ -226,6 +229,8 @@ def chatglm4_attention_forward(
  self.config, enough_kv_room, KV_CACHE_ALLOC_BLOCK_LENGTH
  )
  else:
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states,
+ n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

ipex_llm/transformers/models/chatglm4v.py
@@ -230,7 +230,7 @@ def chatglm4v_attention_forward(
  key_states[..., :rot_dim] = k_rot[...]

  # IPEX-LLM OPT: kv cache and quantize kv
- use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states)
+ use_quantize_kv = use_quantize_kv_cache(self.query_key_value, query_states, n_head, n_kv_head)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device

ipex_llm/transformers/models/common.py
@@ -157,8 +157,10 @@ def rms_norm_forward(self, hidden_states: torch.Tensor):
  weight = self.weight
  if hasattr(self, "variance_epsilon"):
  eps = self.variance_epsilon
- else:
+ elif hasattr(self, "epsilon"):
  eps = self.epsilon
+ else:
+ eps = self.eps

  if hidden_states.device.type == 'xpu' and hidden_states.dtype in [torch.float, torch.half]:
  import xe_addons
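
This common.py change lets the shared rms_norm_forward find the epsilon attribute under whichever name a model uses (variance_epsilon, epsilon, or eps), which is what allows it to replace the chatglm_rms_norm_forward and baichuan_13b_rms_norm_forward removed above. A CPU-only reference of the same dispatch plus the RMSNorm math (the xe_addons XPU fast path from the diff is omitted):

    import torch

    def rms_norm_reference(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Resolve eps under whichever attribute name the wrapped RMSNorm defines.
        if hasattr(self, "variance_epsilon"):
            eps = self.variance_epsilon
        elif hasattr(self, "epsilon"):
            eps = self.epsilon
        else:
            eps = self.eps
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + eps)
        return self.weight * hidden_states.to(input_dtype)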

ipex_llm/transformers/models/glm.py
@@ -147,7 +147,7 @@ def glm_model_forward_wrapper(origin_forward):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)

  if use_cache:

ipex_llm/transformers/models/internlm.py
@@ -87,7 +87,8 @@ def internlm_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qkv_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -171,7 +172,8 @@ def internlm2_attention_forward(
  )

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, hidden_states.device
@@ -346,7 +348,8 @@ def internlm_xcomposser2_attention_forward(
  query_states, key_states, cos, sin, position_ids, "internlm")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.wqkv, hidden_states,
+ self.num_heads, self.num_key_value_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device

ipex_llm/transformers/models/llama.py
@@ -72,7 +72,7 @@ def llama_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  use_compresskv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/minicpm.py
@@ -159,7 +159,7 @@ def minicpm_model_forward_wrapper(origin_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/minicpm3.py
@@ -66,7 +66,9 @@ def minicpm3_model_forward_wrapper(origin_forward):
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/mistral.py
@@ -71,7 +71,7 @@ def mistral_model_forward(
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  use_cache = use_cache or inputs.device.type == 'xpu'
  use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads //
+ self.config.num_attention_heads,
  self.config.num_key_value_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.size(1)) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/mllama.py
@@ -113,7 +113,7 @@ def mllama_text_model_forward(
  use_cache = True if inputs.device.type == "xpu" else use_cache
  use_quantize_kv = use_quantize_kv_cache(
  self.layers[0].mlp.down_proj, inputs,
- self.config.num_attention_heads // self.config.num_key_value_heads
+ self.config.num_attention_heads, self.config.num_key_value_heads
  )
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):

ipex_llm/transformers/models/phi3.py
@@ -249,7 +249,9 @@ def phi3_model_forward_wrapper(origin_model_forward):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
  inputs = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
  if use_cache:
@@ -305,7 +307,9 @@ def phi3v_model_forward_wrapper(origin_model_forward):
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache and sdp
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/qwen.py
@@ -107,7 +107,8 @@ def qwen_attention_forward(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device
@@ -205,7 +206,8 @@ def qwen_attention_forward_registered(
  query_states = query_states * logn_tensor.type_as(query_states).expand_as(query_states)

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.c_attn, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  past_key_value, key_states, value_states,
  kv_seq_len, use_quantize_kv, device

ipex_llm/transformers/models/qwen2.py
@@ -113,10 +113,10 @@ def qwen2_model_forward(
  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs, num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs, inputs.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)
@@ -305,10 +305,11 @@ def qwen2_model_forward_4_42(

  # ipex-llm changes start
  # IPEX-LLM OPT: kv cache and quantize kv cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (
  self.config.hidden_size != 3584 # disable quantize kv in specific model
  and use_quantize_kv_cache(self.layers[0].mlp.up_proj, inputs_embeds,
- self.config.num_attention_heads//self.config.num_key_value_heads)
+ num_heads, num_kv_heads)
  )
  use_compress_kv = should_use_compresskv(inputs_embeds, inputs_embeds.shape[1]) or \
  isinstance(past_key_values, DynamicCompressCache)

ipex_llm/transformers/models/qwen2_moe.py
@@ -73,8 +73,10 @@ def qwen2moe_model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- input = input_ids if input_ids is not None else inputs_embeds
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input)
+ inputs = input_ids if input_ids is not None else inputs_embeds
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/qwen2_vl.py
@@ -88,7 +88,9 @@ def qwen2_vl_model_forward(
  # IPEX-LLM OPT start: kv cache and quantize kv cache
  inputs = input_ids if input_ids is not None else inputs_embeds
  use_cache = True if inputs.device.type == "xpu" else use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.down_proj, inputs,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/stablelm.py
@@ -69,8 +69,10 @@ def stablelm_model_forward(
  ):
  # IPEX-LLM OPT: kv cache and quantize kv cache
  use_cache = use_cache if use_cache is not None else self.config.use_cache
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
  use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 80, 96, 128]
- and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids))
+ and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids,
+ num_heads, num_kv_heads))
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/starcoder2.py
@@ -132,7 +132,9 @@ def model_forward(
  return_dict: Optional[bool] = None,
  ):
  use_cache = use_cache if use_cache is not None else self.config.use_cache
- use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids)
+ num_heads, num_kv_heads = self.config.num_attention_heads, self.config.num_key_value_heads
+ use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.c_fc, input_ids,
+ num_heads, num_kv_heads)
  if use_cache:
  if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache):
  past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values)

ipex_llm/transformers/models/utils.py
@@ -74,7 +74,8 @@ def append_kv_cache(cache_k, cache_v, key_states, value_states):
  return new_cache_k, new_cache_v


- def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: int = 1) -> bool:
+ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
+ num_heads: int, num_kv_heads: int) -> bool:
  if os.environ.get("BIGDL_QUANTIZE_KV_CACHE", None) is not None:
  warnings.warn(
  "`BIGDL_QUANTIZE_KV_CACHE` is deprecated and will be removed in future releases. "
@@ -90,8 +91,11 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor, kv_group: in
  else:
  device_name = get_xpu_device_name(x.device)
  return (
- device_name in ["mtl", "lnl", "arl"] and kv_group == 1
- or device_name in ["arc", "bmg"] and x.size(0) > 1
+ num_kv_heads >= 4
+ and (
+ device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4
+ or device_name in ["arc", "bmg"] and x.size(0) > 1
+ )
  )


ipex_llm/transformers/models/yuan.py
@@ -158,7 +158,8 @@ def yuan_attention_forward(
  "yuan")

  # IPEX-LLM OPT: kv cache and quantzie kv cache
- use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states)
+ use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states,
+ self.num_heads, self.num_heads)
  key_states, value_states = update_past_key_value(
  None if past_key_value is None else (past_key_value[0], past_key_value[1]),
  key_states, value_states,
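
All of the model-file hunks above feed the same utils.py change: use_quantize_kv_cache now takes the explicit (num_heads, num_kv_heads) pair instead of a precomputed kv_group ratio, and quantized KV cache is additionally gated on num_kv_heads >= 4. A standalone illustration of the new gate (device names and head counts are example values; the env-var overrides shown in the diff are omitted):

    def quantize_kv_gate(device_name: str, num_heads: int,
                         num_kv_heads: int, batch_size: int) -> bool:
        # Mirrors the return expression of the new use_quantize_kv_cache.
        return (
            num_kv_heads >= 4
            and (
                (device_name in ["mtl", "lnl", "arl"] and num_heads // num_kv_heads <= 4)
                or (device_name in ["arc", "bmg"] and batch_size > 1)
            )
        )

    print(quantize_kv_gate("mtl", 32, 8, 1))   # True: 8 KV heads, ratio 4
    print(quantize_kv_gate("mtl", 32, 2, 1))   # False: fewer than 4 KV heads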

{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ipex-llm
- Version: 2.2.0b20250108
+ Version: 2.2.0b20250109
  Summary: Large Language Model Develop Toolkit
  Home-page: https://github.com/intel-analytics/ipex-llm
  Author: BigDL Authors
@@ -27,10 +27,10 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
  Provides-Extra: cpp
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250108 ; extra == 'cpp'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp'
  Requires-Dist: setuptools ; extra == 'cpp'
  Provides-Extra: cpp-arl
- Requires-Dist: bigdl-core-cpp ==2.6.0b20250108 ; extra == 'cpp-arl'
+ Requires-Dist: bigdl-core-cpp ==2.6.0b20250109 ; extra == 'cpp-arl'
  Requires-Dist: setuptools ; extra == 'cpp-arl'
  Requires-Dist: onednn-devel ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
  Requires-Dist: onednn ==2024.1.1 ; (platform_system == "Windows") and extra == 'cpp-arl'
@@ -67,7 +67,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
  Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
  Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
- Requires-Dist: bigdl-core-npu ==2.6.0b20250108 ; (platform_system == "Windows") and extra == 'npu'
+ Requires-Dist: bigdl-core-npu ==2.6.0b20250109 ; (platform_system == "Windows") and extra == 'npu'
  Provides-Extra: serving
  Requires-Dist: py-cpuinfo ; extra == 'serving'
  Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -87,9 +87,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250108 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250108 ; extra == 'xpu'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250108 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu'
  Provides-Extra: xpu-2-1
  Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
  Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -104,9 +104,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
  Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
  Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
- Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250108 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
+ Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250109 ; extra == 'xpu-2-1'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
  Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
  Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -124,7 +124,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
  Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
  Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
- Requires-Dist: bigdl-core-xe-all ==2.6.0b20250108 ; extra == 'xpu-2-6'
+ Requires-Dist: bigdl-core-xe-all ==2.6.0b20250109 ; extra == 'xpu-2-6'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-6'
  Provides-Extra: xpu-arc
  Requires-Dist: py-cpuinfo ; extra == 'xpu-arc'
@@ -137,9 +137,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
  Requires-Dist: tabulate ; extra == 'xpu-arc'
  Requires-Dist: setuptools ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arc'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -160,9 +160,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
  Requires-Dist: tabulate ; extra == 'xpu-arl'
  Requires-Dist: setuptools ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-arl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -183,9 +183,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
  Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
  Requires-Dist: tabulate ; extra == 'xpu-lnl'
  Requires-Dist: setuptools ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
- Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250108 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
+ Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250109 ; extra == 'xpu-lnl'
  Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
  Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
  Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'

{ipex_llm-2.2.0b20250108.dist-info → ipex_llm-2.2.0b20250109.dist-info}/RECORD
@@ -94,7 +94,7 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
  ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
  ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
  ipex_llm/transformers/__init__.py,sha256=l4KkMkLe-pRC7b_kj6LCfeifgE-Uo33_Av_FwN9HnFA,1074
- ipex_llm/transformers/convert.py,sha256=APf2uHMgEeiAhsKm9dPgPWlyO0ADq2yHtZgovv9oczU,99101
+ ipex_llm/transformers/convert.py,sha256=umI137wqV2d4itS0AJQoZcygeWBATpSJSDJ805cZ-SY,98499
  ipex_llm/transformers/convert_ipex.py,sha256=iKXo0n8fVFTOA2fNYYrByMFK0dovL-kLd2sVDk88AlQ,14334
  ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
  ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
@@ -144,45 +144,45 @@ ipex_llm/transformers/gguf/models/model_implement/yuan2/configuration_yuan.py,sh
  ipex_llm/transformers/gguf/models/model_implement/yuan2/yuan_hf_model.py,sha256=_AOGMV65XHxgTxIib7lgs49InopcecTzRwgtYR8NTUg,51084
  ipex_llm/transformers/models/__init__.py,sha256=tp2DcVkKg1-QvdYk7DY7rZvQWCDQ4ZjU8NAQ7Fclrpg,584
  ipex_llm/transformers/models/aquila.py,sha256=VZb5Drpo_fTxwcExZ397LygnsNPX2sVbie9_JeFudZI,5252
- ipex_llm/transformers/models/baichuan.py,sha256=oJCAEENSG8oQhJ-QPN2SiapARjAGdOM6nEbyCcYOMCo,19334
- ipex_llm/transformers/models/bert.py,sha256=bJNic2pt1kph0kBwdK5MRGyWupFfx2Ts0V3D1L-5kWo,6085
+ ipex_llm/transformers/models/baichuan.py,sha256=cAQLmVG-3R8CSTGTcDy2JOOzVe-Ej8AXjIEIjvZBGlo,18376
+ ipex_llm/transformers/models/bert.py,sha256=0Mm9jkvkzBxtc_z_GE1TcZoPz-HOg2Z2973ZEWgSwJk,5601
  ipex_llm/transformers/models/bloom.py,sha256=PxfzyYT-nFn3K5rZhTQjmcEjUUzAhUFzxIN4kzRlCuc,8103
  ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx8TJB21WrHk,12597
- ipex_llm/transformers/models/chatglm2.py,sha256=SGCABJdYQLW0zDarEoWrEQLuWlbq9iQhYU8ZeR1-ptQ,15957
- ipex_llm/transformers/models/chatglm4.py,sha256=AAhAFFDDas5DBQPfh2Mwl7a2v7taKf6xphoeeNNFaBI,16593
- ipex_llm/transformers/models/chatglm4v.py,sha256=tyjDDyF6FEgLAT24EG3i4-auxZvkwmeLIy0Hds4K5Yo,14105
- ipex_llm/transformers/models/common.py,sha256=4obQMGF02FCiXrHnFle9Fsx7C33b1FDt37qJJ4YgxRc,11578
+ ipex_llm/transformers/models/chatglm2.py,sha256=KyAIX7zGVQDQuwwM3QMBNWZbTeMHEzKUIgAryT0voHc,14933
+ ipex_llm/transformers/models/chatglm4.py,sha256=QvUehdaCePB3MNHyWg3dneDxmjtBdxYeKUyQUVcsgfM,16886
+ ipex_llm/transformers/models/chatglm4v.py,sha256=L6y45M_wjS2_HqchmCUxRlQZUNuSNCGOiynAQrGh918,14124
+ ipex_llm/transformers/models/common.py,sha256=Q3IEfGqvxoHyfIIF5s8qHmOJBBP3b2jyVAXk8C3b1Pg,11636
  ipex_llm/transformers/models/decilm.py,sha256=P-PBuDPf07GvKggLwJx_wPwIn6esN3rX8ai2JxRuZmE,5246
  ipex_llm/transformers/models/gemma.py,sha256=_E3Yw8Y45xyNVeLqyVKcpr8kjuICtETeL82cJ-bWJuU,9424
  ipex_llm/transformers/models/gemma2.py,sha256=2WZuv-FLzJyTJFaYxOuzJt47QE64M0lHnzAiO5T6ozI,8049
- ipex_llm/transformers/models/glm.py,sha256=PE43uks9lojndBBHFVXK1VWisHhbY-kuCmhq0CwmD4s,7204
+ ipex_llm/transformers/models/glm.py,sha256=lmeEWd_W2O638VzVW4Gm6cJre5XZcg_QBmPs8NWqXsM,7202
  ipex_llm/transformers/models/gpt2.py,sha256=YSaNgK1uLCFDuIFqnKO0Mi-AsOZsYav-7pNf_NpKGdM,3445
  ipex_llm/transformers/models/gptbigcode.py,sha256=cP1_qGWoa43R2WacAMblShjku4QupcCZiLaPPAoOUs4,9101
  ipex_llm/transformers/models/gptneox.py,sha256=loRh1x_5S6BCeOr_s5xr-N_1SQHL3Y5IiUBAEyoMUqQ,6172
- ipex_llm/transformers/models/internlm.py,sha256=ZbIUMDwNRcrCeduXfbA_uq1AUEWawEt6CJRvQl3LkAg,17832
+ ipex_llm/transformers/models/internlm.py,sha256=OifyiobRligleyZLpLBSe44A6Sq0uMG-8-NOcRCcT4Q,18080
  ipex_llm/transformers/models/internvl.py,sha256=Vx0vENIEQLX2M6P398mw5TOhpks0U8xf8rtRQvy94go,8154
- ipex_llm/transformers/models/llama.py,sha256=n1JG1uElMB8t3Hpae94S6YTO_5q2N5BUAhb7mncvA6E,8560
- ipex_llm/transformers/models/minicpm.py,sha256=_eYBYafQxnuqKo9ENNkua73KU5goU2z-dkaLlF5uHnA,10147
- ipex_llm/transformers/models/minicpm3.py,sha256=FhNS6mi2rg7dSdF_QQGrao3g9EC6XLn1MTKd-kd0wF0,9191
+ ipex_llm/transformers/models/llama.py,sha256=NzpyQve_RC9ez1W-jWPLGZ80k_S1I5Rx5saAzCsDIoI,8558
+ ipex_llm/transformers/models/minicpm.py,sha256=eaPNVNrep0_xGoELhZd886ff0ceoKqB6cusdAhd52eE,10145
+ ipex_llm/transformers/models/minicpm3.py,sha256=11cYl8KM2hoIJNMAOZMxiwCu6dMhup9ric_OEn8-VrQ,9363
  ipex_llm/transformers/models/minicpmv.py,sha256=PP05b5iTnrMpiseCn8iJcxKJDnfq7WqXp9Mrch0kKZ0,9876
- ipex_llm/transformers/models/mistral.py,sha256=rE1GWQxXvF6aG-buPHDR13zeynDZEDIubPF4PiVhZbM,7451
- ipex_llm/transformers/models/mllama.py,sha256=ogpLmmN_OwcFUyjYB-oDC-l3uw8urFvUEc5edkjWHAk,10939
+ ipex_llm/transformers/models/mistral.py,sha256=uVhkdXaq15v1P3QY0emVsA7SxUbAWChHEEXYN-drjpQ,7449
+ ipex_llm/transformers/models/mllama.py,sha256=ZyRq9DTKsvk1AlRbr-z6ngjS3Sr_7YuGZ6-Yr1MBBAM,10937
  ipex_llm/transformers/models/mpt.py,sha256=z02NwHogJZVh-Mk4sYoIzR90SFIKhoNN_-ifsD907TQ,9540
  ipex_llm/transformers/models/phi.py,sha256=E6qz4EEuHIVGvaPo-wtLC5lz3iyMqTbAE_cRlcjQRKI,6670
- ipex_llm/transformers/models/phi3.py,sha256=jkiadJ85ToHpymY5GOM6orWlnx6LKN8_-v1MUcfGWPg,15159
+ ipex_llm/transformers/models/phi3.py,sha256=Fo6PlZ24Gdm7eeeZOTMm1Bfh3U6P4rvq7-_2FHvp0vE,15503
  ipex_llm/transformers/models/phixtral.py,sha256=MDTMghcu7qAmZmRcUGqXXDXhSU3y_N59HRIXmlcjp5g,4890
- ipex_llm/transformers/models/qwen.py,sha256=XIJ_bLzediBURWU-OOS3H6WBIGXQue6jDdUHJsAabwY,19391
- ipex_llm/transformers/models/qwen2.py,sha256=b49HO4GSudwGJ3n6uHVno1oo3DgRt3jOjtQnLOB3cdY,25530
- ipex_llm/transformers/models/qwen2_moe.py,sha256=EA_OYxYAEgrvi7VpDW192AJXG9Fwe2aBtOAZPkOAJk4,19350
- ipex_llm/transformers/models/qwen2_vl.py,sha256=jIm4yZSd751BkRqgj3wR1QBkDIh-TMCLAMM8SZ8n6Qo,13419
+ ipex_llm/transformers/models/qwen.py,sha256=A3WiVCzA7NLkcjp4zhFkZvKZzZWZlg0WFuVV_556TAI,19543
+ ipex_llm/transformers/models/qwen2.py,sha256=JLaY9ZT7A22oO0G8K-nvjvKQDaIrKA5o-jEHvk_y3eI,25604
+ ipex_llm/transformers/models/qwen2_moe.py,sha256=a0gYo-ngf8SxaEnBdZUJDnPS6Mkn_poDd8xqhx50icI,19516
+ ipex_llm/transformers/models/qwen2_vl.py,sha256=NrhxlaPj7W-HUBmKc3CSTwZy1lkoZ9qDaxM4GvE0kHs,13583
  ipex_llm/transformers/models/qwen_vl.py,sha256=j7Nzzz2Qvynu9yrCXmoEfERjw43hXof5TbXIs7Ms-oY,17105
  ipex_llm/transformers/models/rwkv4.py,sha256=H4KMtxN0JA2ZTXnonHpsUUJ5xULemo-D1Jzl0ri_UY8,6123
  ipex_llm/transformers/models/rwkv5.py,sha256=OkRNj1pCAZg1z2Fw-I0DEnxLEdZyPeRSQ6msrkxLOCs,10710
  ipex_llm/transformers/models/sd.py,sha256=VvHV5u-0k2MgHu3NL9113hPj7DgfxqctuKzEEeNfRDU,5981
- ipex_llm/transformers/models/stablelm.py,sha256=RGQCYuQhYqtZ1j3RZkYi0_QvCRnUgUIPYxfBcLnElzg,6885
- ipex_llm/transformers/models/starcoder2.py,sha256=4P3mhRYf2Kreb1ESjrQGfy1puLMmZXgV35zf-Tksvao,6462
- ipex_llm/transformers/models/utils.py,sha256=isBCMMQP3j_opmda9XzD_dPk1ejvEXTztggbu1yIMSc,15439
- ipex_llm/transformers/models/yuan.py,sha256=1jRPebwAK2ENbyYokOmb4LSVo-szucWiygz9zTv-scs,7656
+ ipex_llm/transformers/models/stablelm.py,sha256=fj-XtOnR6kggnFUQTMPCOOzolkPztN06WAv8QW-XRnI,7054
+ ipex_llm/transformers/models/starcoder2.py,sha256=ONKvD7JCkRM0DI-R56x28QFBJ7CjD5hOZBQ_3WfOcNk,6626
+ ipex_llm/transformers/models/utils.py,sha256=ihbWS5kQK2KHDVPkMhgjik3nM8B2fWf-E-z4BWNUstk,15568
+ ipex_llm/transformers/models/yuan.py,sha256=JYAn_ZaSGK0NBJLEIxCACfAq084a66GFJkdd5NbpmMA,7732
  ipex_llm/transformers/npu_models/__init__.py,sha256=ulEUGLjaP48LCrVeury3UxLjXxKzRi0UpSG4bYu-7f8,585
  ipex_llm/transformers/npu_models/baichuan.py,sha256=fJtd7fBrttySghRUgfZTAdxLjsSNC-XL08HISsXigLE,4685
  ipex_llm/transformers/npu_models/baichuan_mp.py,sha256=tHhO-0v5z6IhxsfzAPYWXVbLrV_4z89DIb4JjE3207M,45026
@@ -250,11 +250,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
  ipex_llm/vllm/xpu/engine/engine.py,sha256=k4-D27WS_Gk3mA--w3HWAjPjb4Aiu043MVPi0ZoAUBc,5984
  ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=GshTZFB8e4PWvqckfbmTOU6b0oLkNn7A-vzLuG9--j8,21544
  ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=2rENA2ucynMaIjiZBEh2ez1o5vR32GaP514t39CD7KM,8676
- ipex_llm-2.2.0b20250108.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
- ipex_llm-2.2.0b20250108.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
- ipex_llm-2.2.0b20250108.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
- ipex_llm-2.2.0b20250108.dist-info/METADATA,sha256=NJp_uuPOJe8b5UQ8ASJbfzen2BGoc2DEM1ZInzr0X9E,12705
- ipex_llm-2.2.0b20250108.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
- ipex_llm-2.2.0b20250108.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
- ipex_llm-2.2.0b20250108.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
- ipex_llm-2.2.0b20250108.dist-info/RECORD,,
+ ipex_llm-2.2.0b20250109.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ ipex_llm-2.2.0b20250109.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ ipex_llm-2.2.0b20250109.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ ipex_llm-2.2.0b20250109.dist-info/METADATA,sha256=gPslIWSw_X5E5ULhQa8rOHeRo_UeBDXCAyPjBSPB-nU,12705
+ ipex_llm-2.2.0b20250109.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ ipex_llm-2.2.0b20250109.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ ipex_llm-2.2.0b20250109.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ ipex_llm-2.2.0b20250109.dist-info/RECORD,,