ipex-llm 2.2.0b20250211__py3-none-win_amd64.whl → 2.2.0b20250213__py3-none-win_amd64.whl

This diff compares two publicly released versions of this package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (45)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +6 -4
  31. ipex_llm/transformers/models/janus.py +49 -0
  32. ipex_llm/transformers/models/utils.py +1 -1
  33. ipex_llm/vllm/xpu/engine/engine.py +117 -20
  34. ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
  35. ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
  36. ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
  37. ipex_llm/vllm/xpu/model_convert.py +25 -19
  38. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/METADATA +19 -19
  39. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/RECORD +45 -43
  40. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250213.data}/scripts/ipex-llm-init.bat +0 -0
  41. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250213.data}/scripts/llm-chat.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250213.data}/scripts/llm-cli.ps1 +0 -0
  43. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/WHEEL +0 -0
  44. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/entry_points.txt +0 -0
  45. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom.dll CHANGED
Binary file
ipex_llm/libs/gptneox.dll CHANGED
Binary file
ipex_llm/libs/llama.dll CHANGED
Binary file
(The remaining 26 changed files under ipex_llm/libs/ are also binary; binary diffs are not shown.)
ipex_llm/transformers/convert.py CHANGED
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
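The new lines change vLLM detection: setting the IPEX_LLM_NOT_USE_VLLM environment variable (to any value) now makes is_vllm_available() return False, skipping the vLLM code paths entirely. A minimal sketch of the resulting control flow, assuming the rest of the function still falls through to import-based detection (the helper below is illustrative, not the packaged function):

    import os

    def is_vllm_available_sketch() -> bool:
        # Any value of IPEX_LLM_NOT_USE_VLLM disables vLLM integration,
        # even if the vllm package is installed.
        if os.getenv("IPEX_LLM_NOT_USE_VLLM") is not None:
            return False
        try:
            import vllm  # noqa: F401  (the real check also adjusts sys.path)
            return True
        except ImportError:
            return False

    os.environ["IPEX_LLM_NOT_USE_VLLM"] = "1"
    assert is_vllm_available_sketch() is False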
@@ -667,7 +668,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 out_features,
                 mp_group,
                 None,
-                None,
                 optimize_lm_head,
                 None
             )
@@ -1066,7 +1066,7 @@ def _optimize_pre(model, qtype=None):
         from ipex_llm.transformers.models.baichuan_m1 import pre_register_inv_freq
         model.apply(pre_register_inv_freq)
     elif model.config.model_type == "multi_modality":
-        pass
+        _optimize_pre(model.language_model)
 
     return model
 
@@ -2012,8 +2012,10 @@ def _optimize_post(model):
         # vision
         vpm_modeling_module_name = model.vision_model.vision_tower.__class__.__module__
         vpm_module = importlib.import_module(vpm_modeling_module_name)
-
         from ipex_llm.transformers.models.janus import vision_attention_forward
         convert_forward(model.vision_model, vpm_module.Attention, vision_attention_forward)
 
+        # llm
+        _optimize_post(model.language_model)
+
         return model
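Together with the "multi_modality" branch in _optimize_pre above, this makes Janus-style models optimize both halves: the vision tower gets the SDPA forward from the new janus.py below, and the wrapped language model is recursively passed through the same pre/post optimization as a plain text model. A hedged loading sketch following the usual ipex-llm transformers flow (the model id and kwargs are illustrative):

    from ipex_llm.transformers import AutoModelForCausalLM

    # Assumes a Janus-style checkpoint whose config.model_type is
    # "multi_modality"; trust_remote_code is needed for its custom classes.
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/Janus-Pro-1B",  # illustrative model id
        load_in_4bit=True,
        trust_remote_code=True,
    ).to("xpu")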
ipex_llm/transformers/models/janus.py ADDED
@@ -0,0 +1,49 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
+
+import torch
+
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+
+
+def vision_attention_forward(self, x: torch.Tensor) -> torch.Tensor:
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    if self.fused_attn:
+        # ipex-llm opt: sdpa
+        x = scaled_dot_product_attention(
+            q, k.contiguous(), v.contiguous(), None, False
+        )
+    else:
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, C)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
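The fused branch routes through ipex-llm's scaled_dot_product_attention helper instead of materializing the N×N attention matrix. The two branches are numerically equivalent up to floating-point error; a quick CPU sanity sketch, using PyTorch's stock SDPA as a stand-in for the ipex-llm helper (which dispatches to device-specific kernels):

    import torch
    import torch.nn.functional as F

    B, H, N, D = 1, 4, 16, 32
    q, k, v = (torch.randn(B, H, N, D) for _ in range(3))

    fused = F.scaled_dot_product_attention(q, k, v)   # fused path
    scores = (q * D ** -0.5) @ k.transpose(-2, -1)    # manual path
    manual = scores.softmax(dim=-1) @ v

    assert torch.allclose(fused, manual, atol=1e-5)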
ipex_llm/transformers/models/utils.py CHANGED
@@ -86,7 +86,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
-    elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+    elif linear.weight.dtype != torch.uint8:  # unquantized
         return False
     else:
         device_name = get_xpu_device_name(x.device)
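The rewritten condition identifies unquantized layers by weight dtype instead of enumerating ggml qtypes: ipex-llm packs quantized weights into torch.uint8 buffers, so any other dtype (fp16, bf16, fp32) means the layer was left unquantized and the quantized KV cache stays off. A small sketch of just that predicate (the real function also consults the env vars and device shown above):

    import torch

    def weight_is_unquantized(linear: torch.nn.Module) -> bool:
        # Quantized ipex-llm layers store packed uint8 weights;
        # anything else is an ordinary floating-point layer.
        return linear.weight.dtype != torch.uint8

    assert weight_is_unquantized(torch.nn.Linear(8, 8).half())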
ipex_llm/vllm/xpu/engine/engine.py CHANGED
@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import Dict, Optional
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
 from vllm.utils import Counter
-from vllm.config import EngineConfig
+from vllm.config import VllmConfig
 from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
 from vllm.usage.usage_lib import UsageContext
 from vllm.engine.metrics import StatLoggerBase
 from vllm.engine.multiprocessing.engine import MQLLMEngine
 import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
+
+logger = init_logger(__name__)
 
 
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[EngineConfig] = None,
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         load_in_low_bit: str = "sym_int4",
@@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
                                         usage_context=usage_context, stat_loggers=stat_loggers)
 
 
+class IPEXLLMAsyncV1Engine(AsyncLLM):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
 class IPEXLLMClass(LLM):
     def __init__(
         self,
@@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space: int = 4,
+        swap_space: float = 4,
         cpu_offload_gb: float = 0,
-        enforce_eager: bool = False,
-        max_context_len_to_capture: Optional[int] = None,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
-        removed_vision_keys = ("image_token_id", "image_feature_size",
-                               "image_input_shape", "image_input_type")
-        if any(k in kwargs for k in removed_vision_keys):
-            raise TypeError(  # noqa
-                "There is no need to pass vision-related arguments anymore.")
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-        self.llm_engine = IPEXLLMLLMEngine.from_engine_args(
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
             engine_args, usage_context=UsageContext.LLM_CLASS,
             load_in_low_bit=load_in_low_bit)
+
         self.request_counter = Counter()
 
+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            return IPEXLLMLLMV1Engine
+        return IPEXLLMLLMEngine
+
+
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args,
+                                        usage_context,
+                                        stat_loggers,
+                                        enable_multiprocessing)
+
 
 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
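With get_engine_class() above, IPEXLLMClass mirrors upstream vLLM's V0/V1 split: the engine implementation is chosen at construction time from envs.VLLM_USE_V1 instead of being hard-wired to IPEXLLMLLMEngine. A hedged usage sketch (import path taken from this file's location; model id illustrative):

    import os
    os.environ["VLLM_USE_V1"] = "1"  # opt in to the vLLM V1 engine

    from ipex_llm.vllm.xpu.engine.engine import IPEXLLMClass

    # get_engine_class() now resolves to IPEXLLMLLMV1Engine; without
    # the env var it falls back to the V0 IPEXLLMLLMEngine.
    llm = IPEXLLMClass(model="meta-llama/Llama-3.1-8B-Instruct",
                       load_in_low_bit="sym_int4")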
@@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):
 
 
 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str):
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
 
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm
         raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
 
-    signal.signal(signal.SIGTERM, signal_handler)
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
 
-    engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
-                                                 usage_context=usage_context,
-                                                 ipc_path=ipc_path,
-                                                 load_in_low_bit=load_in_low_bit)
-    engine.start()
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine
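run_mp_engine now reports failures through the new engine_alive argument instead of letting the spawned engine process die silently. A sketch of how a caller (e.g. the reworked api_server in this release) would be expected to wire it up with a shared multiprocessing flag; the surrounding variable names are illustrative:

    import multiprocessing as mp

    engine_alive = mp.Value('b', True, lock=False)
    proc = mp.Process(target=run_mp_engine,
                      args=(engine_args, usage_context, ipc_path,
                            "sym_int4", engine_alive))
    proc.start()

    # The parent can poll the flag and fail fast if the engine process
    # raised during startup or serving.
    if not engine_alive.value:
        raise RuntimeError("Engine process failed to start")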