ipex-llm 2.2.0b20250211__py3-none-win_amd64.whl → 2.2.0b20250213__py3-none-win_amd64.whl

This diff compares two publicly released versions of this package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (45)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +6 -4
  31. ipex_llm/transformers/models/janus.py +49 -0
  32. ipex_llm/transformers/models/utils.py +1 -1
  33. ipex_llm/vllm/xpu/engine/engine.py +117 -20
  34. ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
  35. ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
  36. ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
  37. ipex_llm/vllm/xpu/model_convert.py +25 -19
  38. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/METADATA +19 -19
  39. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/RECORD +45 -43
  40. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250213.data}/scripts/ipex-llm-init.bat +0 -0
  41. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250213.data}/scripts/llm-chat.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250213.data}/scripts/llm-cli.ps1 +0 -0
  43. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/WHEEL +0 -0
  44. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/entry_points.txt +0 -0
  45. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250213.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom.dll CHANGED
Binary file
ipex_llm/libs/gptneox.dll CHANGED
Binary file
ipex_llm/libs/llama.dll CHANGED
Binary file
(The remaining 26 changed files under ipex_llm/libs/ are also binary; binary diffs are not shown.)
ipex_llm/transformers/convert.py CHANGED
@@ -70,8 +70,9 @@ def is_auto_awq_available():
 
 def is_vllm_available():
     global _IS_VLLM_AVAILABLE
+    _IS_VLLM_AVAILABLE = os.getenv("IPEX_LLM_NOT_USE_VLLM", None)
     if _IS_VLLM_AVAILABLE is not None:
-        return _IS_VLLM_AVAILABLE
+        return False
     import sys
     original_path = sys.path
     # Temporally remove current directory
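The new lines change vLLM detection: setting the IPEX_LLM_NOT_USE_VLLM environment variable (to any value) now makes is_vllm_available() return False, skipping the vLLM code paths entirely. A minimal sketch of the resulting control flow, assuming the rest of the function still falls through to import-based detection (the helper below is illustrative, not the packaged function):

    import os

    def is_vllm_available_sketch() -> bool:
        # Any value of IPEX_LLM_NOT_USE_VLLM disables vLLM integration,
        # even if the vllm package is installed.
        if os.getenv("IPEX_LLM_NOT_USE_VLLM") is not None:
            return False
        try:
            import vllm  # noqa: F401  (the real check also adjusts sys.path)
            return True
        except ImportError:
            return False

    os.environ["IPEX_LLM_NOT_USE_VLLM"] = "1"
    assert is_vllm_available_sketch() is False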
@@ -667,7 +668,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 out_features,
                 mp_group,
                 None,
-                None,
                 optimize_lm_head,
                 None
             )
@@ -1066,7 +1066,7 @@ def _optimize_pre(model, qtype=None):
         from ipex_llm.transformers.models.baichuan_m1 import pre_register_inv_freq
         model.apply(pre_register_inv_freq)
     elif model.config.model_type == "multi_modality":
-        pass
+        _optimize_pre(model.language_model)
 
     return model
 
@@ -2012,8 +2012,10 @@ def _optimize_post(model):
         # vision
         vpm_modeling_module_name = model.vision_model.vision_tower.__class__.__module__
         vpm_module = importlib.import_module(vpm_modeling_module_name)
-
         from ipex_llm.transformers.models.janus import vision_attention_forward
         convert_forward(model.vision_model, vpm_module.Attention, vision_attention_forward)
 
+        # llm
+        _optimize_post(model.language_model)
+
         return model
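Together with the "multi_modality" branch in _optimize_pre above, this makes Janus-style models optimize both halves: the vision tower gets the SDPA forward from the new janus.py below, and the wrapped language model is recursively passed through the same pre/post optimization as a plain text model. A hedged loading sketch following the usual ipex-llm transformers flow (the model id and kwargs are illustrative):

    from ipex_llm.transformers import AutoModelForCausalLM

    # Assumes a Janus-style checkpoint whose config.model_type is
    # "multi_modality"; trust_remote_code is needed for its custom classes.
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/Janus-Pro-1B",  # illustrative model id
        load_in_4bit=True,
        trust_remote_code=True,
    ).to("xpu")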
ipex_llm/transformers/models/janus.py ADDED
@@ -0,0 +1,49 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
+
+import torch
+
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+
+
+def vision_attention_forward(self, x: torch.Tensor) -> torch.Tensor:
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    if self.fused_attn:
+        # ipex-llm opt: sdpa
+        x = scaled_dot_product_attention(
+            q, k.contiguous(), v.contiguous(), None, False
+        )
+    else:
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, C)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
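The fused branch routes through ipex-llm's scaled_dot_product_attention helper instead of materializing the N×N attention matrix. The two branches are numerically equivalent up to floating-point error; a quick CPU sanity sketch, using PyTorch's stock SDPA as a stand-in for the ipex-llm helper (which dispatches to device-specific kernels):

    import torch
    import torch.nn.functional as F

    B, H, N, D = 1, 4, 16, 32
    q, k, v = (torch.randn(B, H, N, D) for _ in range(3))

    fused = F.scaled_dot_product_attention(q, k, v)   # fused path
    scores = (q * D ** -0.5) @ k.transpose(-2, -1)    # manual path
    manual = scores.softmax(dim=-1) @ v

    assert torch.allclose(fused, manual, atol=1e-5)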
ipex_llm/transformers/models/utils.py CHANGED
@@ -86,7 +86,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
-    elif linear.qtype in [ggml_tensor_qtype["fp16"], ggml_tensor_qtype["bf16"]]:
+    elif linear.weight.dtype != torch.uint8:  # unquantized
         return False
     else:
         device_name = get_xpu_device_name(x.device)
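The rewritten condition identifies unquantized layers by weight dtype instead of enumerating ggml qtypes: ipex-llm packs quantized weights into torch.uint8 buffers, so any other dtype (fp16, bf16, fp32) means the layer was left unquantized and the quantized KV cache stays off. A small sketch of just that predicate (the real function also consults the env vars and device shown above):

    import torch

    def weight_is_unquantized(linear: torch.nn.Module) -> bool:
        # Quantized ipex-llm layers store packed uint8 weights;
        # anything else is an ordinary floating-point layer.
        return linear.weight.dtype != torch.uint8

    assert weight_is_unquantized(torch.nn.Linear(8, 8).half())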
ipex_llm/vllm/xpu/engine/engine.py CHANGED
@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import Dict, Optional
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
 from vllm.utils import Counter
-from vllm.config import EngineConfig
+from vllm.config import VllmConfig
 from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
 from vllm.usage.usage_lib import UsageContext
 from vllm.engine.metrics import StatLoggerBase
 from vllm.engine.multiprocessing.engine import MQLLMEngine
 import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
+
+logger = init_logger(__name__)
 
 
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[EngineConfig] = None,
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         load_in_low_bit: str = "sym_int4",
@@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
                                         usage_context=usage_context, stat_loggers=stat_loggers)
 
 
+class IPEXLLMAsyncV1Engine(AsyncLLM):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
 class IPEXLLMClass(LLM):
     def __init__(
         self,
@@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space: int = 4,
+        swap_space: float = 4,
         cpu_offload_gb: float = 0,
-        enforce_eager: bool = False,
-        max_context_len_to_capture: Optional[int] = None,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
-        removed_vision_keys = ("image_token_id", "image_feature_size",
-                               "image_input_shape", "image_input_type")
-        if any(k in kwargs for k in removed_vision_keys):
-            raise TypeError(  # noqa
-                "There is no need to pass vision-related arguments anymore.")
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-        self.llm_engine = IPEXLLMLLMEngine.from_engine_args(
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
             engine_args, usage_context=UsageContext.LLM_CLASS,
             load_in_low_bit=load_in_low_bit)
+
         self.request_counter = Counter()
 
+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            return IPEXLLMLLMV1Engine
+        return IPEXLLMLLMEngine
+
+
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args,
+                                        usage_context,
+                                        stat_loggers,
+                                        enable_multiprocessing)
+
 
 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
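With get_engine_class() above, IPEXLLMClass mirrors upstream vLLM's V0/V1 split: the engine implementation is chosen at construction time from envs.VLLM_USE_V1 instead of being hard-wired to IPEXLLMLLMEngine. A hedged usage sketch (import path taken from this file's location; model id illustrative):

    import os
    os.environ["VLLM_USE_V1"] = "1"  # opt in to the vLLM V1 engine

    from ipex_llm.vllm.xpu.engine.engine import IPEXLLMClass

    # get_engine_class() now resolves to IPEXLLMLLMV1Engine; without
    # the env var it falls back to the V0 IPEXLLMLLMEngine.
    llm = IPEXLLMClass(model="meta-llama/Llama-3.1-8B-Instruct",
                       load_in_low_bit="sym_int4")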
@@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):
 
 
 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str):
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
 
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm
         raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
 
-    signal.signal(signal.SIGTERM, signal_handler)
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
 
-    engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
-                                                 usage_context=usage_context,
-                                                 ipc_path=ipc_path,
-                                                 load_in_low_bit=load_in_low_bit)
-    engine.start()
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine
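run_mp_engine now reports failures through the new engine_alive argument instead of letting the spawned engine process die silently. A sketch of how a caller (e.g. the reworked api_server in this release) would be expected to wire it up with a shared multiprocessing flag; the surrounding variable names are illustrative:

    import multiprocessing as mp

    engine_alive = mp.Value('b', True, lock=False)
    proc = mp.Process(target=run_mp_engine,
                      args=(engine_args, usage_context, ipc_path,
                            "sym_int4", engine_alive))
    proc.start()

    # The parent can poll the flag and fail fast if the engine process
    # raised during startup or serving.
    if not engine_alive.value:
        raise RuntimeError("Engine process failed to start")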