ipex-llm 2.2.0b20250210__py3-none-win_amd64.whl → 2.2.0b20250212__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +23 -1
  31. ipex_llm/transformers/low_bit_linear.py +1 -1
  32. ipex_llm/transformers/models/baichuan_m1.py +240 -0
  33. ipex_llm/transformers/models/janus.py +49 -0
  34. ipex_llm/transformers/models/utils.py +1 -1
  35. ipex_llm/vllm/xpu/engine/engine.py +117 -20
  36. ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
  37. ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
  38. ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
  39. ipex_llm/vllm/xpu/model_convert.py +25 -19
  40. {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/METADATA +19 -19
  41. {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/RECORD +47 -44
  42. {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/ipex-llm-init.bat +0 -0
  43. {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-chat.ps1 +0 -0
  44. {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-cli.ps1 +0 -0
  45. {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/WHEEL +0 -0
  46. {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/entry_points.txt +0 -0
  47. {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/top_level.txt +0 -0
Binary files changed: the native libraries and tools under ipex_llm/libs/ listed above (items 1-29, the bloom/gptneox/llama/starcoder DLLs and their *_avx/*_vnni variants, pipeline.dll, and the main-*/quantize-* executables) were rebuilt between the two versions; no textual diff is shown for them.
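Context for the source changes below (not part of the package diff itself): the new Baichuan-M1 and Janus branches in convert.py are exercised when a model is loaded through ipex-llm's transformers API. A minimal sketch, assuming the Baichuan-M1-14B-Instruct checkpoint referenced in the new baichuan_m1.py and an Intel GPU ("xpu") device; the prompt and generation arguments are illustrative only.

import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_path = "baichuan-inc/Baichuan-M1-14B-Instruct"
# load_in_4bit quantizes weights to sym_int4; optimize_model routes the model
# through the _optimize_pre/_optimize_post hooks changed in this release
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True)
model = model.half().to("xpu")

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
inputs = tokenizer("What does low-bit inference mean?", return_tensors="pt").to("xpu")
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))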
@@ -667,7 +667,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     out_features,
                     mp_group,
                     None,
-                    None,
                     optimize_lm_head,
                     None
                 )
@@ -1062,6 +1061,11 @@ def _optimize_pre(model, qtype=None):
         from ipex_llm.transformers.models.glm import merge_qkv, split_mlp
         model.apply(merge_qkv)
         model.apply(split_mlp)
+    elif model.config.model_type == "baichuan_m1":
+        from ipex_llm.transformers.models.baichuan_m1 import pre_register_inv_freq
+        model.apply(pre_register_inv_freq)
+    elif model.config.model_type == "multi_modality":
+        _optimize_pre(model.language_model)
 
     return model
 
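The baichuan_m1 branch above only re-registers each rotary embedding's inv_freq as a non-persistent buffer before conversion. A rough illustration of one plausible reason, not taken from the diff: plain tensor attributes are ignored by Module.to(...), while buffers follow module-level moves and casts, so downstream kernels can rely on inv_freq matching the module's device and dtype.

import torch

class ToyRotary(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inv_freq = torch.arange(4, dtype=torch.float32)  # plain attribute

rot = ToyRotary()
inv_freq = rot.inv_freq
del rot.inv_freq
# same trick as pre_register_inv_freq above
rot.register_buffer("inv_freq", inv_freq, persistent=False)

rot.to(torch.float64)        # buffers follow .to(); a plain attribute would not
print(rot.inv_freq.dtype)    # torch.float64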
@@ -1994,5 +1998,23 @@ def _optimize_post(model):
         model.llm.config.rope_scaling = {"rope_type": "default"}
         _optimize_post(model.llm)
         model.llm.config.model_type = "megrezo"
+    elif model.config.model_type == "baichuan_m1":
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.baichuan_m1 import model_forward
+        from ipex_llm.transformers.models.baichuan_m1 import eager_attention_forward
+        convert_forward(model, module.BaichuanModel, model_forward)
+        convert_forward(model, module.BaichuanRMSNorm, rms_norm_forward)
+        convert_forward(model, module.BaichuanAttention, eager_attention_forward)
+    elif model.config.model_type == "multi_modality":
+        # vision
+        vpm_modeling_module_name = model.vision_model.vision_tower.__class__.__module__
+        vpm_module = importlib.import_module(vpm_modeling_module_name)
+        from ipex_llm.transformers.models.janus import vision_attention_forward
+        convert_forward(model.vision_model, vpm_module.Attention, vision_attention_forward)
+
+        # llm
+        _optimize_post(model.language_model)
 
     return model
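convert_forward is ipex-llm's hook for swapping a module class's forward implementation; a rough sketch of the idea (not the library's actual code) is method patching over every matching sub-module, which is how BaichuanAttention instances above end up running eager_attention_forward from baichuan_m1.py.

import types
import torch

def patch_forward(model: torch.nn.Module, target_cls: type, new_forward) -> None:
    # bind new_forward as the forward of every instance of target_cls
    for module in model.modules():
        if isinstance(module, target_cls):
            module.forward = types.MethodType(new_forward, module)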
@@ -699,7 +699,7 @@ class LowBitLinear(nn.Linear):
         if is_server() and (not is_spr()) and \
                 self.qtype == SYM_INT4 and x_2d.shape[0] >= TORCH_LINEAR_THRESHOLD:
             x0_fp32 = ggml_int4_convert_fp32(x0, self.weight_shape, self.weight_length)
-            result = F.linear(x, x0_fp32)
+            result = F.linear(x.to(dtype=x0_fp32.dtype), x0_fp32)
         else:
             # Weight does not need a convert
             result = ggml_matmul_src1_x_src0_t(x0, x_2d, self.weight_shape, self.qtype)
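The one-line change above exists because F.linear requires its input and weight to share a dtype; on the server path the int4 weight is dequantized to fp32, so a half-precision activation has to be cast first. A minimal illustration with made-up shapes and dtypes:

import torch
import torch.nn.functional as F

x = torch.randn(2, 8, dtype=torch.float16)   # activation, e.g. fp16
w = torch.randn(4, 8, dtype=torch.float32)   # stands in for the ggml_int4_convert_fp32 output

# F.linear(x, w) raises a dtype-mismatch error on most backends,
# so the input is cast to the weight's dtype first:
y = F.linear(x.to(dtype=w.dtype), w)
print(y.shape)  # torch.Size([2, 4])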
@@ -0,0 +1,240 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://huggingface.co/baichuan-inc/Baichuan-M1-14B-Instruct/blob/main/modeling_baichuan.py
+
+
+import math
+import torch
+import torch.nn.functional as F
+
+from typing import Optional, Tuple, Union
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from ipex_llm.utils.common import invalidInputError
+from ipex_llm.transformers.models.utils import should_use_fuse_rope, repeat_kv
+from ipex_llm.transformers.models.common import attention_softmax
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+from ipex_llm.transformers.kv import DynamicNormalCache
+
+
+def pre_register_inv_freq(module: torch.nn.Module):
+    if module.__class__.__name__ == "RotaryEmbedding":
+        inv_freq = module.inv_freq
+        del module.inv_freq
+        module.register_buffer("inv_freq", inv_freq, persistent=False)
+
+
+# copied from Baichuan M1
+def custom_convolution(U, K):
+    """
+    U: Input matrix, shape (bs, seq, h, d)
+    K: Convolution kernel, shape (w, h)
+    Returns: Output matrix V, shape (bs, seq, h, d)
+    """
+    # h, w = K.shape
+    w = K.size(-1)
+    padding = (w - 1, 0)
+    U_padded = F.pad(U, (0, 0, 0, 0, *padding))  # Shape becomes (bs, seq+w-1, h, d)
+    U_unfolded = U_padded.unfold(1, w, 1)  # Shape becomes (bs, seq+w-1, h, d, w)
+    V_unfolded = U_unfolded * K  # Shape remains (bs, seq, h, d, w)
+    V = V_unfolded.sum(dim=-1)  # Shape becomes (bs, seq, h, d)
+    return V
+
+
+def model_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    seqlens: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Cache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+) -> Union[Tuple, BaseModelOutputWithPast]:
+    output_attentions = (
+        output_attentions if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    invalidInputError((input_ids is None) ^ (inputs_embeds is None),
+                      "You cannot specify both input_ids and inputs_embeds at the same time, "
+                      "and must specify either one")
+
+    if inputs_embeds is None:
+        inputs_embeds = self.embed_tokens(input_ids)
+
+    use_cache = use_cache if use_cache is not None else self.config.use_cache
+    use_cache = True if inputs_embeds.device.type == "xpu" else use_cache
+
+    # IPEX-LLM changes start: remove batch multi-pack and use ipex-llm's kv cache
+    # kept for BC (non `Cache` `past_key_values` inputs)
+    if use_cache and not isinstance(past_key_values, DynamicNormalCache):
+        past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
+    # IPEX-LLM changes end
+
+    if cache_position is None:
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1],
+            device=inputs_embeds.device
+        )
+    if position_ids is None:
+        position_ids = cache_position.unsqueeze(0)
+
+    causal_mask = self._update_causal_mask(
+        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+    )
+
+    hidden_states = inputs_embeds
+
+    # create position embeddings to be shared across the decoder layers
+    # position_embeddings = self.rotary_emb(hidden_states, position_ids)
+    position_embeddings = None
+
+    # decoder layers
+    all_hidden_states = () if output_hidden_states else None
+    all_self_attns = () if output_attentions else None
+    next_decoder_cache = None
+
+    for decoder_layer in self.layers:
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        layer_outputs = decoder_layer(
+            hidden_states,
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            seqlens=None,
+            past_key_value=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+
+        hidden_states = layer_outputs[0]
+        if use_cache:
+            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+        if output_attentions:
+            all_self_attns += (layer_outputs[1],)
+
+    hidden_states = self.norm(hidden_states)
+
+    # add hidden states from the last decoder layer
+    if output_hidden_states:
+        all_hidden_states += (hidden_states,)
+
+    next_cache = next_decoder_cache if use_cache else None
+    if not return_dict:
+        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                     if v is not None)
+    return BaseModelOutputWithPast(
+        last_hidden_state=hidden_states,
+        past_key_values=next_cache,
+        hidden_states=all_hidden_states,
+        attentions=all_self_attns,
+    )
+
+
+def eager_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    seqlens: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]]=None,
+):
+    invalidInputError(seqlens is None, "`seq_lens` must be None")
+
+    bsz, q_len, _ = hidden_states.size()
+    qkv = self.W_pack(hidden_states)
+    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
+    query_states, key_states, value_states = qkv.split([self.num_heads,
+                                                        self.num_key_value_heads,
+                                                        self.num_key_value_heads], dim=2)
+    # q, k, v: [bsz, seq_len, num_heads, head_dim]
+
+    if past_key_value is None or past_key_value.get_seq_length(self.layer_idx) == 0:  # prefill
+        self.last_k = key_states[:, -1:]
+        self.last_v = value_states[:, -1:]
+
+        key_states = custom_convolution(key_states, self.conv_k)
+        value_states = custom_convolution(value_states, self.conv_v)
+    else:
+        new_key_states = (self.conv_k[0, 0, :, 0, :1] * self.last_k +
+                          self.conv_k[0, 0, :, 0, 1:] * key_states)
+        self.last_k = key_states
+        key_states = new_key_states
+
+        new_value_states = (self.conv_v[0, 0, :, 0, :1] * self.last_v +
+                            self.conv_v[0, 0, :, 0, 1:] * value_states)
+        self.last_v = value_states
+        value_states = new_value_states
+
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+    # q, k, v: [bsz, num_heads, seq_len, head_dim]
+
+    invalidInputError(should_use_fuse_rope(hidden_states, position_ids, self.training),
+                      "fuse rope must be used")
+    import xe_addons
+    xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids,
+                                   query_states, key_states)
+
+    # ignore sliding window
+    key_states, value_states = past_key_value.update(key_states, value_states,
+                                                     self.layer_idx, None)
+    if self.head_dim <= 128:
+        attn_weights = None
+        attn_output = scaled_dot_product_attention(
+            query_states, key_states, value_states,
+            attention_mask, q_len == key_states.size(2)
+        )
+    else:
+        n_rep = self.num_heads // self.num_key_value_heads
+        key_states = repeat_kv(key_states, n_rep)
+        value_states = repeat_kv(value_states, n_rep)
+        attn_weights = torch.matmul(query_states,
+                                    key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+        attn_weights = attention_softmax(attn_weights)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+    return attn_output, attn_weights, past_key_value
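A small sanity sketch of custom_convolution (not part of the new file), assuming a kernel of width 2 shaped (1, 1, h, 1, w) as the conv_k/conv_v indexing above implies: the prefill convolution is causal over the sequence dimension, and the single-token decode update reproduces its last output column.

import torch
from ipex_llm.transformers.models.baichuan_m1 import custom_convolution

bs, seq, h, d, w = 1, 5, 3, 4, 2
U = torch.randn(bs, seq, h, d)
K = torch.randn(1, 1, h, 1, w)                # broadcasts over batch, seq and d

prefill = custom_convolution(U, K)            # (bs, seq, h, d)
# decode-path formula from eager_attention_forward, applied to the last position
decode = K[0, 0, :, 0, :1] * U[:, -2] + K[0, 0, :, 0, 1:] * U[:, -1]
print(torch.allclose(prefill[:, -1], decode, atol=1e-6))  # True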
@@ -0,0 +1,49 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
+
+import torch
+
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+
+
+def vision_attention_forward(self, x: torch.Tensor) -> torch.Tensor:
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    if self.fused_attn:
+        # ipex-llm opt: sdpa
+        x = scaled_dot_product_attention(
+            q, k.contiguous(), v.contiguous(), None, False
+        )
+    else:
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, C)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
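The fused branch above defers to ipex-llm's scaled_dot_product_attention helper on XPU; assuming the usual 1/sqrt(head_dim) scale, it computes the same result as the manual branch it bypasses. A quick check with PyTorch's built-in SDPA (illustrative, not the XPU kernel):

import torch
import torch.nn.functional as F

B, H, N, D = 1, 2, 7, 16
q = torch.randn(B, H, N, D)
k = torch.randn(B, H, N, D)
v = torch.randn(B, H, N, D)

fused = F.scaled_dot_product_attention(q, k, v)
manual = torch.softmax((q @ k.transpose(-2, -1)) * D ** -0.5, dim=-1) @ v
print(torch.allclose(fused, manual, atol=1e-5))  # True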
@@ -86,7 +86,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
-    elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+    elif linear.weight.dtype != torch.uint8:  # unquantized
         return False
     else:
         device_name = get_xpu_device_name(x.device)
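Usage note (not from the diff): the environment overrides checked at the top of use_quantize_kv_cache still take precedence over the new dtype-based test, so KV-cache quantization can be forced on or off regardless of whether the linear weights themselves are quantized.

import os

# force-enable the quantized kv cache before loading the model
os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] = "1"
# or force-disable it:
# os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] = "0"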
@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import Dict, Optional
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
 from vllm.utils import Counter
-from vllm.config import EngineConfig
+from vllm.config import VllmConfig
 from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
 from vllm.usage.usage_lib import UsageContext
 from vllm.engine.metrics import StatLoggerBase
 from vllm.engine.multiprocessing.engine import MQLLMEngine
 import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
+
+logger = init_logger(__name__)
 
 
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[EngineConfig] = None,
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         load_in_low_bit: str = "sym_int4",
@@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
                                         usage_context=usage_context, stat_loggers=stat_loggers)
 
 
+class IPEXLLMAsyncV1Engine(AsyncLLM):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
 class IPEXLLMClass(LLM):
     def __init__(
         self,
@@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space: int = 4,
+        swap_space: float = 4,
         cpu_offload_gb: float = 0,
-        enforce_eager: bool = False,
-        max_context_len_to_capture: Optional[int] = None,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
-        removed_vision_keys = ("image_token_id", "image_feature_size",
-                               "image_input_shape", "image_input_type")
-        if any(k in kwargs for k in removed_vision_keys):
-            raise TypeError(  # noqa
-                "There is no need to pass vision-related arguments anymore.")
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-        self.llm_engine = IPEXLLMLLMEngine.from_engine_args(
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
             engine_args, usage_context=UsageContext.LLM_CLASS,
             load_in_low_bit=load_in_low_bit)
+
         self.request_counter = Counter()
 
+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            return IPEXLLMLLMV1Engine
+        return IPEXLLMLLMEngine
+
+
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args,
+                                        usage_context,
+                                        stat_loggers,
+                                        enable_multiprocessing)
+
 
 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
@@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):
 
 
 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str):
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
 
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm
         raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
 
-    signal.signal(signal.SIGTERM, signal_handler)
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
 
-    engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
-                                                 usage_context=usage_context,
-                                                 ipc_path=ipc_path,
-                                                 load_in_low_bit=load_in_low_bit)
-    engine.start()
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine
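Taken together, the engine.py changes wire ipex-llm's low-bit conversion into both the legacy vLLM engine and the new V1 engine. A minimal offline-inference sketch built on IPEXLLMClass; the model id and sampling arguments are placeholders, and whether IPEXLLMLLMV1Engine is picked depends on the VLLM_USE_V1 environment variable, as in get_engine_class().

from vllm import SamplingParams
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",   # placeholder model path
          load_in_low_bit="sym_int4",                  # ipex-llm low-bit weight format
          enforce_eager=True)
outputs = llm.generate(["What is IPEX-LLM?"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)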