PyPI - ipex-llm - Versions diffs - 2.3.0b20250427__py3-none-win_amd64.whl → 2.3.0b20250501__py3-none-win_amd64.whl - Mend

ipex-llm 2.3.0b20250427__py3-none-win_amd64.whl → 2.3.0b20250501__py3-none-win_amd64.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (42)

ipex_llm/libs/bloom-api.dll +0 -0
ipex_llm/libs/bloom.dll +0 -0
ipex_llm/libs/gptneox-api.dll +0 -0
ipex_llm/libs/gptneox.dll +0 -0
ipex_llm/libs/libbloom_avx.dll +0 -0
ipex_llm/libs/libbloom_vnni.dll +0 -0
ipex_llm/libs/libgptneox_avx.dll +0 -0
ipex_llm/libs/libgptneox_vnni.dll +0 -0
ipex_llm/libs/libllama_avx.dll +0 -0
ipex_llm/libs/libllama_vnni.dll +0 -0
ipex_llm/libs/libstarcoder_avx.dll +0 -0
ipex_llm/libs/libstarcoder_vnni.dll +0 -0
ipex_llm/libs/llama-api.dll +0 -0
ipex_llm/libs/llama.dll +0 -0
ipex_llm/libs/main-bloom.exe +0 -0
ipex_llm/libs/main-gptneox.exe +0 -0
ipex_llm/libs/main-llama.exe +0 -0
ipex_llm/libs/main-starcoder.exe +0 -0
ipex_llm/libs/pipeline.dll +0 -0
ipex_llm/libs/quantize-bloom.exe +0 -0
ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
ipex_llm/libs/quantize-gptneox.exe +0 -0
ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
ipex_llm/libs/quantize-llama.exe +0 -0
ipex_llm/libs/quantize-llama_vnni.exe +0 -0
ipex_llm/libs/quantize-starcoder.exe +0 -0
ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
ipex_llm/libs/starcoder-api.dll +0 -0
ipex_llm/libs/starcoder.dll +0 -0
ipex_llm/transformers/convert.py +3 -2
ipex_llm/vllm/xpu/engine/__init__.py +3 -1
ipex_llm/vllm/xpu/engine/engine.py +163 -19
ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +448 -180
ipex_llm/vllm/xpu/model_convert.py +5 -2
{ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/METADATA +11 -11
{ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/RECORD +42 -42
{ipex_llm-2.3.0b20250427.data → ipex_llm-2.3.0b20250501.data}/scripts/ipex-llm-init.bat +0 -0
{ipex_llm-2.3.0b20250427.data → ipex_llm-2.3.0b20250501.data}/scripts/llm-chat.ps1 +0 -0
{ipex_llm-2.3.0b20250427.data → ipex_llm-2.3.0b20250501.data}/scripts/llm-cli.ps1 +0 -0
{ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/WHEEL +0 -0
{ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/entry_points.txt +0 -0
{ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/top_level.txt +0 -0

ipex_llm/libs/bloom-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/bloom.dll CHANGED Viewed

Binary file

ipex_llm/libs/gptneox-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/gptneox.dll CHANGED Viewed

Binary file

ipex_llm/libs/libbloom_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libbloom_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libgptneox_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libgptneox_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libllama_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libllama_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libstarcoder_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libstarcoder_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/llama-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/llama.dll CHANGED Viewed

Binary file

ipex_llm/libs/main-bloom.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-gptneox.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-llama.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-starcoder.exe CHANGED Viewed

Binary file

ipex_llm/libs/pipeline.dll CHANGED Viewed

Binary file

ipex_llm/libs/quantize-bloom.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-bloom_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-gptneox.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-gptneox_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-llama.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-llama_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-starcoder.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-starcoder_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/starcoder-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/starcoder.dll CHANGED Viewed

Binary file

ipex_llm/transformers/convert.py CHANGED Viewed

@@ -150,12 +150,13 @@ def is_linear_module(module):
         if _VLLM_VERSION is None:
             _VLLM_VERSION = get_package_version('vllm')
         from vllm.model_executor.layers.linear import (
-            ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear
+            ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
+            MergedColumnParallelLinear, ReplicatedLinear
         )
         from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
         VLLM_LINEAR_LIST = [
             ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
-            MergedColumnParallelLinear,
+            MergedColumnParallelLinear, ReplicatedLinear,
         ]
         if 'xpu' in _VLLM_VERSION:
             VLLM_LINEAR_LIST.append(ParallelLMHead)

ipex_llm/vllm/xpu/engine/__init__.py CHANGED Viewed

@@ -13,10 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
+from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine, IPEXLLMAsyncV1Engine, IPEXLLMLLMV1Engine
 __all__ = [
     "IPEXLLMAsyncLLMEngine",
     "IPEXLLMLLMEngine",
     "IPEXLLMClass",
+    "IPEXLLMAsyncV1Engine",
+    "IPEXLLMLLMV1Engine",
     "run_mp_engine",
 ]

ipex_llm/vllm/xpu/engine/engine.py CHANGED Viewed

@@ -38,6 +38,8 @@ logger = init_logger(__name__)
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
+    _is_converted = False
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -53,13 +55,39 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     ) -> "AsyncLLMEngine":
         """Creates an async LLM engine from the engine arguments."""
         # Create the engine configs.
-        _ipex_llm_convert(load_in_low_bit)
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
         return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
                                         start_engine_loop=start_engine_loop,
                                         usage_context=usage_context, stat_loggers=stat_loggers)
+    @classmethod
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[dict[str, StatLoggerBase]]=None,
+        disable_log_requests: bool = False,
+        disable_log_stats: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "AsyncLLMEngine":
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
+        return super().from_vllm_config(
+            vllm_config=vllm_config,
+            start_engine_loop=start_engine_loop,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            disable_log_requests=disable_log_requests,
+            disable_log_stats=disable_log_stats,
+        )
 class IPEXLLMAsyncV1Engine(AsyncLLM):
+    _is_converted = False
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -74,13 +102,39 @@ class IPEXLLMAsyncV1Engine(AsyncLLM):
         load_in_low_bit: str = "sym_int4",
         stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
     ) -> "AsyncLLM":
-        _ipex_llm_convert(load_in_low_bit)
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
         return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
                                         start_engine_loop=start_engine_loop,
                                         usage_context=usage_context, stat_loggers=stat_loggers)
+    @classmethod
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[dict[str, StatLoggerBase]]=None,
+        disable_log_requests: bool = False,
+        disable_log_stats: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "AsyncLLM":
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
+        return super().from_vllm_config(
+            vllm_config=vllm_config,
+            start_engine_loop=start_engine_loop,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            disable_log_requests=disable_log_requests,
+            disable_log_stats=disable_log_stats,
+        )
 class IPEXLLMClass(LLM):
     def __init__(
         self,
         model: str,
@@ -94,20 +148,20 @@ class IPEXLLMClass(LLM):
         quantization: Optional[str] = None,
         revision: Optional[str] = None,
         tokenizer_revision: Optional[str] = None,
-        seed: int = 0,
+        seed: Optional[int] = None,
         gpu_memory_utilization: float = 0.9,
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
-        disable_async_output_proc: bool = True,
-        hf_overrides: Optional[HfOverrides] = None,
-        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        disable_async_output_proc: bool = False,
+        hf_overrides: Optional[HfOverrides]=None,
+        mm_processor_kwargs: Optional[dict[str, Any]]=None,
         # After positional args are removed, move this right below `model`
         task: TaskOption = "auto",
         override_pooler_config: Optional[PoolerConfig] = None,
-        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
+        compilation_config: Optional[Union[int, dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
@@ -120,6 +174,13 @@ class IPEXLLMClass(LLM):
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
+        if "worker_cls" in kwargs:
+            worker_cls = kwargs["worker_cls"]
+            # if the worker_cls is not qualified string name,
+            # we serialize it using cloudpickle to avoid pickling issues
+            if isinstance(worker_cls, type):
+                kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
         if compilation_config is not None:
             if isinstance(compilation_config, (int, dict)):
                 compilation_config_instance = CompilationConfig.from_cli(
@@ -159,11 +220,13 @@ class IPEXLLMClass(LLM):
         # Logic to switch between engines is done at runtime instead of import
         # to avoid import order issues
         self.engine_class = self.get_engine_class()
+        # print("!!! ", load_in_low_bit)
         self.llm_engine = self.engine_class.from_engine_args(
             engine_args, usage_context=UsageContext.LLM_CLASS,
             load_in_low_bit=load_in_low_bit)
         self.request_counter = Counter()
+        self.default_sampling_params: Union[dict[str, Any], None] = None
     @staticmethod
     def get_engine_class() -> Type[LLMEngine]:
@@ -173,6 +236,8 @@ class IPEXLLMClass(LLM):
 class IPEXLLMLLMV1Engine(V1LLMEngine):
+    _is_converted = False
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -188,14 +253,37 @@ class IPEXLLMLLMV1Engine(V1LLMEngine):
         """Creates an LLM engine from the engine arguments."""
         # Create the engine configs.
-        _ipex_llm_convert(load_in_low_bit)
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
         return super().from_engine_args(engine_args,
                                         usage_context,
                                         stat_loggers,
                                         enable_multiprocessing)
+    @classmethod
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        disable_log_stats: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
+        return super().from_vllm_config(
+            vllm_config=vllm_config,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            disable_log_stats=disable_log_stats
+        )
 class IPEXLLMLLMEngine(LLMEngine):
+    _is_converted = False
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -209,33 +297,89 @@ class IPEXLLMLLMEngine(LLMEngine):
     ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
         # Create the engine configs.
-        _ipex_llm_convert(load_in_low_bit)
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
         return super().from_engine_args(engine_args, usage_context, stat_loggers)
+    @classmethod
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        disable_log_stats: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
+        return super().from_vllm_config(
+            vllm_config=vllm_config,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            disable_log_stats=disable_log_stats
+        )
 class IPEXLLMMQLLMEngine(MQLLMEngine):
+    _is_converted = False
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
     @classmethod
     def from_engine_args(cls, engine_args: AsyncEngineArgs,
                          usage_context: UsageContext, ipc_path: str, load_in_low_bit: str):
-        _ipex_llm_convert(load_in_low_bit)
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
         return super().from_engine_args(engine_args, usage_context, ipc_path)
+    @classmethod
+    def from_vllm_config(cls, vllm_config: VllmConfig,
+                         usage_context: UsageContext,
+                         disable_log_requests: bool, disable_log_stats: bool,
+                         ipc_path: str, load_in_low_bit: str) -> "MQLLMEngine":
+        if not cls._is_converted:
+            _ipex_llm_convert(load_in_low_bit)
+            cls._is_converted = True
+        return super().from_vllm_config(
+            vllm_config=vllm_config,
+            ipc_path=ipc_path,
+            usage_context=usage_context,
+            disable_log_requests=disable_log_requests,
+            disable_log_stats=disable_log_stats,
+        )
+from vllm.transformers_utils.config import (
+    maybe_register_config_serialize_by_value)
-def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str, engine_alive):
+def signal_handler(*_) -> None:
+    raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
-    def signal_handler(*_) -> None:
-        # Interrupt server on sigterm
-        raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
+def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
+                  ipc_path: str, disable_log_stats: bool,
+                  disable_log_requests: bool, load_in_low_bit, engine_alive):
     try:
+        # Ensure we can serialize transformer config before spawning
+        maybe_register_config_serialize_by_value()
+        engine = IPEXLLMMQLLMEngine.from_vllm_config(
+            vllm_config=vllm_config,
+            usage_context=usage_context,
+            disable_log_stats=disable_log_stats,
+            disable_log_requests=disable_log_requests,
+            load_in_low_bit=load_in_low_bit,
+            ipc_path=ipc_path)
         signal.signal(signal.SIGTERM, signal_handler)
-        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
-                                                     usage_context=usage_context,
-                                                     ipc_path=ipc_path,
-                                                     load_in_low_bit=load_in_low_bit)
         engine.start()
     except BaseException as e:
         logger.exception(e)
         engine_alive.value = False