ipex-llm 2.2.0b20250210__py3-none-win_amd64.whl → 2.2.0b20250212__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +23 -1
- ipex_llm/transformers/low_bit_linear.py +1 -1
- ipex_llm/transformers/models/baichuan_m1.py +240 -0
- ipex_llm/transformers/models/janus.py +49 -0
- ipex_llm/transformers/models/utils.py +1 -1
- ipex_llm/vllm/xpu/engine/engine.py +117 -20
- ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
- ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
- ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
- ipex_llm/vllm/xpu/model_convert.py +25 -19
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/METADATA +19 -19
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/RECORD +47 -44
- {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll CHANGED (binary file)
ipex_llm/libs/bloom.dll CHANGED (binary file)
ipex_llm/libs/gptneox-api.dll CHANGED (binary file)
ipex_llm/libs/gptneox.dll CHANGED (binary file)
ipex_llm/libs/libbloom_avx.dll CHANGED (binary file)
ipex_llm/libs/libbloom_vnni.dll CHANGED (binary file)
ipex_llm/libs/libgptneox_avx.dll CHANGED (binary file)
ipex_llm/libs/libgptneox_vnni.dll CHANGED (binary file)
ipex_llm/libs/libllama_avx.dll CHANGED (binary file)
ipex_llm/libs/libllama_vnni.dll CHANGED (binary file)
ipex_llm/libs/libstarcoder_avx.dll CHANGED (binary file)
ipex_llm/libs/libstarcoder_vnni.dll CHANGED (binary file)
ipex_llm/libs/llama-api.dll CHANGED (binary file)
ipex_llm/libs/llama.dll CHANGED (binary file)
ipex_llm/libs/main-bloom.exe CHANGED (binary file)
ipex_llm/libs/main-gptneox.exe CHANGED (binary file)
ipex_llm/libs/main-llama.exe CHANGED (binary file)
ipex_llm/libs/main-starcoder.exe CHANGED (binary file)
ipex_llm/libs/pipeline.dll CHANGED (binary file)
ipex_llm/libs/quantize-bloom.exe CHANGED (binary file)
ipex_llm/libs/quantize-bloom_vnni.exe CHANGED (binary file)
ipex_llm/libs/quantize-gptneox.exe CHANGED (binary file)
ipex_llm/libs/quantize-gptneox_vnni.exe CHANGED (binary file)
ipex_llm/libs/quantize-llama.exe CHANGED (binary file)
ipex_llm/libs/quantize-llama_vnni.exe CHANGED (binary file)
ipex_llm/libs/quantize-starcoder.exe CHANGED (binary file)
ipex_llm/libs/quantize-starcoder_vnni.exe CHANGED (binary file)
ipex_llm/libs/starcoder-api.dll CHANGED (binary file)
ipex_llm/libs/starcoder.dll CHANGED (binary file)

ipex_llm/transformers/convert.py CHANGED

@@ -667,7 +667,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     out_features,
                     mp_group,
                     None,
-                    None,
                     optimize_lm_head,
                     None
                 )
@@ -1062,6 +1061,11 @@ def _optimize_pre(model, qtype=None):
         from ipex_llm.transformers.models.glm import merge_qkv, split_mlp
         model.apply(merge_qkv)
         model.apply(split_mlp)
+    elif model.config.model_type == "baichuan_m1":
+        from ipex_llm.transformers.models.baichuan_m1 import pre_register_inv_freq
+        model.apply(pre_register_inv_freq)
+    elif model.config.model_type == "multi_modality":
+        _optimize_pre(model.language_model)
 
     return model
 
@@ -1994,5 +1998,23 @@ def _optimize_post(model):
         model.llm.config.rope_scaling = {"rope_type": "default"}
         _optimize_post(model.llm)
         model.llm.config.model_type = "megrezo"
+    elif model.config.model_type == "baichuan_m1":
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.baichuan_m1 import model_forward
+        from ipex_llm.transformers.models.baichuan_m1 import eager_attention_forward
+        convert_forward(model, module.BaichuanModel, model_forward)
+        convert_forward(model, module.BaichuanRMSNorm, rms_norm_forward)
+        convert_forward(model, module.BaichuanAttention, eager_attention_forward)
+    elif model.config.model_type == "multi_modality":
+        # vision
+        vpm_modeling_module_name = model.vision_model.vision_tower.__class__.__module__
+        vpm_module = importlib.import_module(vpm_modeling_module_name)
+        from ipex_llm.transformers.models.janus import vision_attention_forward
+        convert_forward(model.vision_model, vpm_module.Attention, vision_attention_forward)
+
+        # llm
+        _optimize_post(model.language_model)
 
     return model
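
The _optimize_pre/_optimize_post hooks above dispatch on model.config.model_type and patch module forwards through convert_forward. A rough standalone sketch of that mechanism, for illustration only (the real helper lives in ipex_llm.transformers.convert and is not reproduced here):

from types import MethodType
import torch

def convert_forward_sketch(model: torch.nn.Module, target_cls, new_forward):
    # Rebind forward on every instance of target_cls inside the model, so those
    # modules run the optimized implementation instead of the original one.
    for module in model.modules():
        if isinstance(module, target_cls):
            module.forward = MethodType(new_forward, module)
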
ipex_llm/transformers/low_bit_linear.py CHANGED

@@ -699,7 +699,7 @@ class LowBitLinear(nn.Linear):
         if is_server() and (not is_spr()) and \
                 self.qtype == SYM_INT4 and x_2d.shape[0] >= TORCH_LINEAR_THRESHOLD:
             x0_fp32 = ggml_int4_convert_fp32(x0, self.weight_shape, self.weight_length)
-            result = F.linear(x, x0_fp32)
+            result = F.linear(x.to(dtype=x0_fp32.dtype), x0_fp32)
         else:
             # Weight does not need a convert
             result = ggml_matmul_src1_x_src0_t(x0, x_2d, self.weight_shape, self.qtype)
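
The one-line fix above addresses a dtype mismatch on the server fallback path: the int4 weight is dequantized to fp32 while the activation may still be fp16, and F.linear requires matching dtypes. A minimal standalone illustration in plain PyTorch (not ipex-llm code):

import torch
import torch.nn.functional as F

x = torch.randn(4, 8, dtype=torch.float16)         # activation, e.g. fp16
w_fp32 = torch.randn(16, 8, dtype=torch.float32)   # weight dequantized to fp32

# F.linear(x, w_fp32) would raise a dtype-mismatch RuntimeError on most backends;
# casting the activation to the weight's dtype first keeps the matmul well defined.
y = F.linear(x.to(dtype=w_fp32.dtype), w_fp32)
print(y.shape, y.dtype)  # torch.Size([4, 16]) torch.float32
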
ipex_llm/transformers/models/baichuan_m1.py ADDED

@@ -0,0 +1,240 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://huggingface.co/baichuan-inc/Baichuan-M1-14B-Instruct/blob/main/modeling_baichuan.py
+
+
+import math
+import torch
+import torch.nn.functional as F
+
+from typing import Optional, Tuple, Union
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from ipex_llm.utils.common import invalidInputError
+from ipex_llm.transformers.models.utils import should_use_fuse_rope, repeat_kv
+from ipex_llm.transformers.models.common import attention_softmax
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+from ipex_llm.transformers.kv import DynamicNormalCache
+
+
+def pre_register_inv_freq(module: torch.nn.Module):
+    if module.__class__.__name__ == "RotaryEmbedding":
+        inv_freq = module.inv_freq
+        del module.inv_freq
+        module.register_buffer("inv_freq", inv_freq, persistent=False)
+
+
+# copied from Baichuan M1
+def custom_convolution(U, K):
+    """
+    U: Input matrix, shape (bs, seq, h, d)
+    K: Convolution kernel, shape (w, h)
+    Returns: Output matrix V, shape (bs, seq, h, d)
+    """
+    # h, w = K.shape
+    w = K.size(-1)
+    padding = (w - 1, 0)
+    U_padded = F.pad(U, (0, 0, 0, 0, *padding))  # Shape becomes (bs, seq+w-1, h, d)
+    U_unfolded = U_padded.unfold(1, w, 1)  # Shape becomes (bs, seq+w-1, h, d, w)
+    V_unfolded = U_unfolded * K  # Shape remains (bs, seq, h, d, w)
+    V = V_unfolded.sum(dim=-1)  # Shape becomes (bs, seq, h, d)
+    return V
+
+
+def model_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    seqlens: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Cache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+) -> Union[Tuple, BaseModelOutputWithPast]:
+    output_attentions = (
+        output_attentions if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    invalidInputError((input_ids is None) ^ (inputs_embeds is None),
+                      "You cannot specify both input_ids and inputs_embeds at the same time, "
+                      "and must specify either one")
+
+    if inputs_embeds is None:
+        inputs_embeds = self.embed_tokens(input_ids)
+
+    use_cache = use_cache if use_cache is not None else self.config.use_cache
+    use_cache = True if inputs_embeds.device.type == "xpu" else use_cache
+
+    # IPEX-LLM changes start: remove batch multi-pack and use ipex-llm's kv cache
+    # kept for BC (non `Cache` `past_key_values` inputs)
+    if use_cache and not isinstance(past_key_values, DynamicNormalCache):
+        past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
+    # IPEX-LLM changes end
+
+    if cache_position is None:
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1],
+            device=inputs_embeds.device
+        )
+    if position_ids is None:
+        position_ids = cache_position.unsqueeze(0)
+
+    causal_mask = self._update_causal_mask(
+        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+    )
+
+    hidden_states = inputs_embeds
+
+    # create position embeddings to be shared across the decoder layers
+    # position_embeddings = self.rotary_emb(hidden_states, position_ids)
+    position_embeddings = None
+
+    # decoder layers
+    all_hidden_states = () if output_hidden_states else None
+    all_self_attns = () if output_attentions else None
+    next_decoder_cache = None
+
+    for decoder_layer in self.layers:
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        layer_outputs = decoder_layer(
+            hidden_states,
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            seqlens=None,
+            past_key_value=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+
+        hidden_states = layer_outputs[0]
+        if use_cache:
+            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+        if output_attentions:
+            all_self_attns += (layer_outputs[1],)
+
+    hidden_states = self.norm(hidden_states)
+
+    # add hidden states from the last decoder layer
+    if output_hidden_states:
+        all_hidden_states += (hidden_states,)
+
+    next_cache = next_decoder_cache if use_cache else None
+    if not return_dict:
+        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                     if v is not None)
+    return BaseModelOutputWithPast(
+        last_hidden_state=hidden_states,
+        past_key_values=next_cache,
+        hidden_states=all_hidden_states,
+        attentions=all_self_attns,
+    )
+
+
+def eager_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    seqlens: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]]=None,
+):
+    invalidInputError(seqlens is None, "`seq_lens` must be None")
+
+    bsz, q_len, _ = hidden_states.size()
+    qkv = self.W_pack(hidden_states)
+    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
+    query_states, key_states, value_states = qkv.split([self.num_heads,
+                                                        self.num_key_value_heads,
+                                                        self.num_key_value_heads], dim=2)
+    # q, k, v: [bsz, seq_len, num_heads, head_dim]
+
+    if past_key_value is None or past_key_value.get_seq_length(self.layer_idx) == 0:  # prefill
+        self.last_k = key_states[:, -1:]
+        self.last_v = value_states[:, -1:]
+
+        key_states = custom_convolution(key_states, self.conv_k)
+        value_states = custom_convolution(value_states, self.conv_v)
+    else:
+        new_key_states = (self.conv_k[0, 0, :, 0, :1] * self.last_k +
+                          self.conv_k[0, 0, :, 0, 1:] * key_states)
+        self.last_k = key_states
+        key_states = new_key_states
+
+        new_value_states = (self.conv_v[0, 0, :, 0, :1] * self.last_v +
+                            self.conv_v[0, 0, :, 0, 1:] * value_states)
+        self.last_v = value_states
+        value_states = new_value_states
+
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+    # q, k, v: [bsz, num_heads, seq_len, head_dim]
+
+    invalidInputError(should_use_fuse_rope(hidden_states, position_ids, self.training),
+                      "fuse rope must be used")
+    import xe_addons
+    xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids,
+                                   query_states, key_states)
+
+    # ignore sliding window
+    key_states, value_states = past_key_value.update(key_states, value_states,
+                                                     self.layer_idx, None)
+    if self.head_dim <= 128:
+        attn_weights = None
+        attn_output = scaled_dot_product_attention(
+            query_states, key_states, value_states,
+            attention_mask, q_len == key_states.size(2)
+        )
+    else:
+        n_rep = self.num_heads // self.num_key_value_heads
+        key_states = repeat_kv(key_states, n_rep)
+        value_states = repeat_kv(value_states, n_rep)
+        attn_weights = torch.matmul(query_states,
+                                    key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+        attn_weights = attention_softmax(attn_weights)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+    return attn_output, attn_weights, past_key_value
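
custom_convolution above applies a short causal convolution along the sequence dimension by left-padding, unfolding a window of width w, and reducing against the kernel. A rough usage sketch, assuming ipex-llm 2.2.0b20250212 is installed; the kernel shape (1, 1, h, 1, w) is an assumption inferred from how conv_k is indexed in eager_attention_forward, not from the docstring:

import torch
from ipex_llm.transformers.models.baichuan_m1 import custom_convolution

bs, seq, h, d, w = 2, 5, 4, 16, 2
U = torch.randn(bs, seq, h, d)    # e.g. per-head key states before caching
K = torch.randn(1, 1, h, 1, w)    # made-up kernel, broadcastable against (bs, seq, h, d, w)
V = custom_convolution(U, K)
print(V.shape)                    # torch.Size([2, 5, 4, 16]); each position mixes w past steps
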
ipex_llm/transformers/models/janus.py ADDED

@@ -0,0 +1,49 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
+
+import torch
+
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+
+
+def vision_attention_forward(self, x: torch.Tensor) -> torch.Tensor:
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    if self.fused_attn:
+        # ipex-llm opt: sdpa
+        x = scaled_dot_product_attention(
+            q, k.contiguous(), v.contiguous(), None, False
+        )
+    else:
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, C)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
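
The fused branch above goes through ipex-llm's scaled_dot_product_attention wrapper; the else branch is the reference softmax attention. A standalone sanity check in plain PyTorch (not the ipex-llm wrapper) that the two formulations agree:

import torch
import torch.nn.functional as F

B, H, N, D = 1, 3, 7, 16
q, k, v = (torch.randn(B, H, N, D) for _ in range(3))

fused = F.scaled_dot_product_attention(q, k, v)   # fused kernel, default scale 1/sqrt(D)
attn = (q * D ** -0.5) @ k.transpose(-2, -1)      # manual path, mirroring the else branch
manual = attn.softmax(dim=-1) @ v

print(torch.allclose(fused, manual, atol=1e-5))   # True
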
ipex_llm/transformers/models/utils.py CHANGED

@@ -86,7 +86,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
-    elif linear.
+    elif linear.weight.dtype != torch.uint8:  # unquantized
         return False
     else:
         device_name = get_xpu_device_name(x.device)
ipex_llm/vllm/xpu/engine/engine.py CHANGED

@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
 from vllm.utils import Counter
-from vllm.config import
+from vllm.config import VllmConfig
 from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
 from vllm.usage.usage_lib import UsageContext
 from vllm.engine.metrics import StatLoggerBase
 from vllm.engine.multiprocessing.engine import MQLLMEngine
 import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
+
+logger = init_logger(__name__)
 
 
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         load_in_low_bit: str = "sym_int4",
@@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
                                         usage_context=usage_context, stat_loggers=stat_loggers)
 
 
+class IPEXLLMAsyncV1Engine(AsyncLLM):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
 class IPEXLLMClass(LLM):
     def __init__(
         self,
@@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space:
+        swap_space: float = 4,
         cpu_offload_gb: float = 0,
-        enforce_eager: bool =
-        max_context_len_to_capture: Optional[int] = None,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
-
-
-
-
-
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
            engine_args, usage_context=UsageContext.LLM_CLASS,
            load_in_low_bit=load_in_low_bit)
+
        self.request_counter = Counter()
 
+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            return IPEXLLMLLMV1Engine
+        return IPEXLLMLLMEngine
+
+
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args,
+                                        usage_context,
+                                        stat_loggers,
+                                        enable_multiprocessing)
+
 
 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
@@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):
 
 
 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str):
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
 
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm
         raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
 
-
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
 
-
-
-                  ipc_path=ipc_path,
-                  load_in_low_bit=load_in_low_bit)
-    engine.start()
+
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine