ipex-llm 2.3.0b20250427__py3-none-win_amd64.whl → 2.3.0b20250501__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +3 -2
  31. ipex_llm/vllm/xpu/engine/__init__.py +3 -1
  32. ipex_llm/vllm/xpu/engine/engine.py +163 -19
  33. ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +448 -180
  34. ipex_llm/vllm/xpu/model_convert.py +5 -2
  35. {ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/METADATA +11 -11
  36. {ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/RECORD +42 -42
  37. {ipex_llm-2.3.0b20250427.data → ipex_llm-2.3.0b20250501.data}/scripts/ipex-llm-init.bat +0 -0
  38. {ipex_llm-2.3.0b20250427.data → ipex_llm-2.3.0b20250501.data}/scripts/llm-chat.ps1 +0 -0
  39. {ipex_llm-2.3.0b20250427.data → ipex_llm-2.3.0b20250501.data}/scripts/llm-cli.ps1 +0 -0
  40. {ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/WHEEL +0 -0
  41. {ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/entry_points.txt +0 -0
  42. {ipex_llm-2.3.0b20250427.dist-info → ipex_llm-2.3.0b20250501.dist-info}/top_level.txt +0 -0
Binary file
ipex_llm/libs/bloom.dll CHANGED
Binary file
Binary file
ipex_llm/libs/gptneox.dll CHANGED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
ipex_llm/libs/llama.dll CHANGED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -150,12 +150,13 @@ def is_linear_module(module):
150
150
  if _VLLM_VERSION is None:
151
151
  _VLLM_VERSION = get_package_version('vllm')
152
152
  from vllm.model_executor.layers.linear import (
153
- ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear
153
+ ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
154
+ MergedColumnParallelLinear, ReplicatedLinear
154
155
  )
155
156
  from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
156
157
  VLLM_LINEAR_LIST = [
157
158
  ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
158
- MergedColumnParallelLinear,
159
+ MergedColumnParallelLinear, ReplicatedLinear,
159
160
  ]
160
161
  if 'xpu' in _VLLM_VERSION:
161
162
  VLLM_LINEAR_LIST.append(ParallelLMHead)
@@ -13,10 +13,12 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
  #
16
- from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
16
+ from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine, IPEXLLMAsyncV1Engine, IPEXLLMLLMV1Engine
17
17
  __all__ = [
18
18
  "IPEXLLMAsyncLLMEngine",
19
19
  "IPEXLLMLLMEngine",
20
20
  "IPEXLLMClass",
21
+ "IPEXLLMAsyncV1Engine",
22
+ "IPEXLLMLLMV1Engine",
21
23
  "run_mp_engine",
22
24
  ]
@@ -38,6 +38,8 @@ logger = init_logger(__name__)
38
38
 
39
39
 
40
40
  class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
41
+ _is_converted = False
42
+
41
43
  def __init__(self, *args, **kwargs):
42
44
  super().__init__(*args, **kwargs)
43
45
 
@@ -53,13 +55,39 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
53
55
  ) -> "AsyncLLMEngine":
54
56
  """Creates an async LLM engine from the engine arguments."""
55
57
  # Create the engine configs.
56
- _ipex_llm_convert(load_in_low_bit)
58
+ if not cls._is_converted:
59
+ _ipex_llm_convert(load_in_low_bit)
60
+ cls._is_converted = True
57
61
  return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
58
62
  start_engine_loop=start_engine_loop,
59
63
  usage_context=usage_context, stat_loggers=stat_loggers)
60
64
 
65
+ @classmethod
66
+ def from_vllm_config(
67
+ cls,
68
+ vllm_config: VllmConfig,
69
+ start_engine_loop: bool = True,
70
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
71
+ stat_loggers: Optional[dict[str, StatLoggerBase]]=None,
72
+ disable_log_requests: bool = False,
73
+ disable_log_stats: bool = False,
74
+ load_in_low_bit: str = "sym_int4",
75
+ ) -> "AsyncLLMEngine":
76
+ if not cls._is_converted:
77
+ _ipex_llm_convert(load_in_low_bit)
78
+ cls._is_converted = True
79
+ return super().from_vllm_config(
80
+ vllm_config=vllm_config,
81
+ start_engine_loop=start_engine_loop,
82
+ usage_context=usage_context,
83
+ stat_loggers=stat_loggers,
84
+ disable_log_requests=disable_log_requests,
85
+ disable_log_stats=disable_log_stats,
86
+ )
87
+
61
88
 
62
89
  class IPEXLLMAsyncV1Engine(AsyncLLM):
90
+ _is_converted = False
63
91
 
64
92
  def __init__(self, *args, **kwargs):
65
93
  super().__init__(*args, **kwargs)
@@ -74,13 +102,39 @@ class IPEXLLMAsyncV1Engine(AsyncLLM):
74
102
  load_in_low_bit: str = "sym_int4",
75
103
  stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, # noqa
76
104
  ) -> "AsyncLLM":
77
- _ipex_llm_convert(load_in_low_bit)
105
+ if not cls._is_converted:
106
+ _ipex_llm_convert(load_in_low_bit)
107
+ cls._is_converted = True
78
108
  return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
79
109
  start_engine_loop=start_engine_loop,
80
110
  usage_context=usage_context, stat_loggers=stat_loggers)
81
111
 
112
+ @classmethod
113
+ def from_vllm_config(
114
+ cls,
115
+ vllm_config: VllmConfig,
116
+ start_engine_loop: bool = True,
117
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
118
+ stat_loggers: Optional[dict[str, StatLoggerBase]]=None,
119
+ disable_log_requests: bool = False,
120
+ disable_log_stats: bool = False,
121
+ load_in_low_bit: str = "sym_int4",
122
+ ) -> "AsyncLLM":
123
+ if not cls._is_converted:
124
+ _ipex_llm_convert(load_in_low_bit)
125
+ cls._is_converted = True
126
+ return super().from_vllm_config(
127
+ vllm_config=vllm_config,
128
+ start_engine_loop=start_engine_loop,
129
+ usage_context=usage_context,
130
+ stat_loggers=stat_loggers,
131
+ disable_log_requests=disable_log_requests,
132
+ disable_log_stats=disable_log_stats,
133
+ )
134
+
82
135
 
83
136
  class IPEXLLMClass(LLM):
137
+
84
138
  def __init__(
85
139
  self,
86
140
  model: str,
@@ -94,20 +148,20 @@ class IPEXLLMClass(LLM):
94
148
  quantization: Optional[str] = None,
95
149
  revision: Optional[str] = None,
96
150
  tokenizer_revision: Optional[str] = None,
97
- seed: int = 0,
151
+ seed: Optional[int] = None,
98
152
  gpu_memory_utilization: float = 0.9,
99
153
  swap_space: float = 4,
100
154
  cpu_offload_gb: float = 0,
101
155
  enforce_eager: Optional[bool] = None,
102
156
  max_seq_len_to_capture: int = 8192,
103
157
  disable_custom_all_reduce: bool = False,
104
- disable_async_output_proc: bool = True,
105
- hf_overrides: Optional[HfOverrides] = None,
106
- mm_processor_kwargs: Optional[Dict[str, Any]]=None,
158
+ disable_async_output_proc: bool = False,
159
+ hf_overrides: Optional[HfOverrides]=None,
160
+ mm_processor_kwargs: Optional[dict[str, Any]]=None,
107
161
  # After positional args are removed, move this right below `model`
108
162
  task: TaskOption = "auto",
109
163
  override_pooler_config: Optional[PoolerConfig] = None,
110
- compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
164
+ compilation_config: Optional[Union[int, dict[str, Any]]]=None,
111
165
  load_in_low_bit: str = "sym_int4",
112
166
  **kwargs,
113
167
  ) -> None:
@@ -120,6 +174,13 @@ class IPEXLLMClass(LLM):
120
174
  if "disable_log_stats" not in kwargs:
121
175
  kwargs["disable_log_stats"] = True
122
176
 
177
+ if "worker_cls" in kwargs:
178
+ worker_cls = kwargs["worker_cls"]
179
+ # if the worker_cls is not qualified string name,
180
+ # we serialize it using cloudpickle to avoid pickling issues
181
+ if isinstance(worker_cls, type):
182
+ kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
183
+
123
184
  if compilation_config is not None:
124
185
  if isinstance(compilation_config, (int, dict)):
125
186
  compilation_config_instance = CompilationConfig.from_cli(
@@ -159,11 +220,13 @@ class IPEXLLMClass(LLM):
159
220
  # Logic to switch between engines is done at runtime instead of import
160
221
  # to avoid import order issues
161
222
  self.engine_class = self.get_engine_class()
223
+ # print("!!! ", load_in_low_bit)
162
224
  self.llm_engine = self.engine_class.from_engine_args(
163
225
  engine_args, usage_context=UsageContext.LLM_CLASS,
164
226
  load_in_low_bit=load_in_low_bit)
165
227
 
166
228
  self.request_counter = Counter()
229
+ self.default_sampling_params: Union[dict[str, Any], None] = None
167
230
 
168
231
  @staticmethod
169
232
  def get_engine_class() -> Type[LLMEngine]:
@@ -173,6 +236,8 @@ class IPEXLLMClass(LLM):
173
236
 
174
237
 
175
238
  class IPEXLLMLLMV1Engine(V1LLMEngine):
239
+ _is_converted = False
240
+
176
241
  def __init__(self, *args, **kwargs):
177
242
  super().__init__(*args, **kwargs)
178
243
 
@@ -188,14 +253,37 @@ class IPEXLLMLLMV1Engine(V1LLMEngine):
188
253
  """Creates an LLM engine from the engine arguments."""
189
254
  # Create the engine configs.
190
255
 
191
- _ipex_llm_convert(load_in_low_bit)
256
+ if not cls._is_converted:
257
+ _ipex_llm_convert(load_in_low_bit)
258
+ cls._is_converted = True
192
259
  return super().from_engine_args(engine_args,
193
260
  usage_context,
194
261
  stat_loggers,
195
262
  enable_multiprocessing)
196
263
 
264
+ @classmethod
265
+ def from_vllm_config(
266
+ cls,
267
+ vllm_config: VllmConfig,
268
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
269
+ stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
270
+ disable_log_stats: bool = False,
271
+ load_in_low_bit: str = "sym_int4",
272
+ ) -> "LLMEngine":
273
+ if not cls._is_converted:
274
+ _ipex_llm_convert(load_in_low_bit)
275
+ cls._is_converted = True
276
+ return super().from_vllm_config(
277
+ vllm_config=vllm_config,
278
+ usage_context=usage_context,
279
+ stat_loggers=stat_loggers,
280
+ disable_log_stats=disable_log_stats
281
+ )
282
+
197
283
 
198
284
  class IPEXLLMLLMEngine(LLMEngine):
285
+ _is_converted = False
286
+
199
287
  def __init__(self, *args, **kwargs):
200
288
  super().__init__(*args, **kwargs)
201
289
 
@@ -209,33 +297,89 @@ class IPEXLLMLLMEngine(LLMEngine):
209
297
  ) -> "LLMEngine":
210
298
  """Creates an LLM engine from the engine arguments."""
211
299
  # Create the engine configs.
212
- _ipex_llm_convert(load_in_low_bit)
300
+ if not cls._is_converted:
301
+ _ipex_llm_convert(load_in_low_bit)
302
+ cls._is_converted = True
213
303
  return super().from_engine_args(engine_args, usage_context, stat_loggers)
214
304
 
305
+ @classmethod
306
+ def from_vllm_config(
307
+ cls,
308
+ vllm_config: VllmConfig,
309
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
310
+ stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
311
+ disable_log_stats: bool = False,
312
+ load_in_low_bit: str = "sym_int4",
313
+ ) -> "LLMEngine":
314
+ if not cls._is_converted:
315
+ _ipex_llm_convert(load_in_low_bit)
316
+ cls._is_converted = True
317
+ return super().from_vllm_config(
318
+ vllm_config=vllm_config,
319
+ usage_context=usage_context,
320
+ stat_loggers=stat_loggers,
321
+ disable_log_stats=disable_log_stats
322
+ )
323
+
215
324
 
216
325
  class IPEXLLMMQLLMEngine(MQLLMEngine):
326
+ _is_converted = False
327
+
328
+ def __init__(self, *args, **kwargs):
329
+ super().__init__(*args, **kwargs)
330
+
217
331
  @classmethod
218
332
  def from_engine_args(cls, engine_args: AsyncEngineArgs,
219
333
  usage_context: UsageContext, ipc_path: str, load_in_low_bit: str):
220
- _ipex_llm_convert(load_in_low_bit)
334
+ if not cls._is_converted:
335
+ _ipex_llm_convert(load_in_low_bit)
336
+ cls._is_converted = True
221
337
  return super().from_engine_args(engine_args, usage_context, ipc_path)
222
338
 
339
+ @classmethod
340
+ def from_vllm_config(cls, vllm_config: VllmConfig,
341
+ usage_context: UsageContext,
342
+ disable_log_requests: bool, disable_log_stats: bool,
343
+ ipc_path: str, load_in_low_bit: str) -> "MQLLMEngine":
344
+
345
+ if not cls._is_converted:
346
+ _ipex_llm_convert(load_in_low_bit)
347
+ cls._is_converted = True
348
+ return super().from_vllm_config(
349
+ vllm_config=vllm_config,
350
+ ipc_path=ipc_path,
351
+ usage_context=usage_context,
352
+ disable_log_requests=disable_log_requests,
353
+ disable_log_stats=disable_log_stats,
354
+ )
355
+
356
+ from vllm.transformers_utils.config import (
357
+ maybe_register_config_serialize_by_value)
358
+
223
359
 
224
- def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
225
- ipc_path: str, load_in_low_bit: str, engine_alive):
360
+ def signal_handler(*_) -> None:
361
+ raise KeyboardInterrupt("MQLLMEngine terminated") # noqa
226
362
 
227
- def signal_handler(*_) -> None:
228
- # Interrupt server on sigterm
229
- raise KeyboardInterrupt("MQLLMEngine terminated") # noqa
230
363
 
364
+ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
365
+ ipc_path: str, disable_log_stats: bool,
366
+ disable_log_requests: bool, load_in_low_bit, engine_alive):
231
367
  try:
368
+ # Ensure we can serialize transformer config before spawning
369
+ maybe_register_config_serialize_by_value()
370
+
371
+ engine = IPEXLLMMQLLMEngine.from_vllm_config(
372
+ vllm_config=vllm_config,
373
+ usage_context=usage_context,
374
+ disable_log_stats=disable_log_stats,
375
+ disable_log_requests=disable_log_requests,
376
+ load_in_low_bit=load_in_low_bit,
377
+ ipc_path=ipc_path)
378
+
232
379
  signal.signal(signal.SIGTERM, signal_handler)
233
380
 
234
- engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
235
- usage_context=usage_context,
236
- ipc_path=ipc_path,
237
- load_in_low_bit=load_in_low_bit)
238
381
  engine.start()
382
+
239
383
  except BaseException as e:
240
384
  logger.exception(e)
241
385
  engine_alive.value = False