ipex-llm 2.3.0b20250428__py3-none-win_amd64.whl → 2.3.0b20250502__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +3 -2
- ipex_llm/vllm/xpu/engine/__init__.py +3 -1
- ipex_llm/vllm/xpu/engine/engine.py +163 -19
- ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +448 -180
- ipex_llm/vllm/xpu/model_convert.py +5 -2
- {ipex_llm-2.3.0b20250428.dist-info → ipex_llm-2.3.0b20250502.dist-info}/METADATA +11 -11
- {ipex_llm-2.3.0b20250428.dist-info → ipex_llm-2.3.0b20250502.dist-info}/RECORD +42 -42
- {ipex_llm-2.3.0b20250428.data → ipex_llm-2.3.0b20250502.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.3.0b20250428.data → ipex_llm-2.3.0b20250502.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.3.0b20250428.data → ipex_llm-2.3.0b20250502.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.3.0b20250428.dist-info → ipex_llm-2.3.0b20250502.dist-info}/WHEEL +0 -0
- {ipex_llm-2.3.0b20250428.dist-info → ipex_llm-2.3.0b20250502.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.3.0b20250428.dist-info → ipex_llm-2.3.0b20250502.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file
|
ipex_llm/libs/bloom.dll
CHANGED
Binary file
|
ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file
|
ipex_llm/libs/gptneox.dll
CHANGED
Binary file
|
ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/llama-api.dll
CHANGED
Binary file
|
ipex_llm/libs/llama.dll
CHANGED
Binary file
|
ipex_llm/libs/main-bloom.exe
CHANGED
Binary file
|
ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file
|
ipex_llm/libs/main-llama.exe
CHANGED
Binary file
|
ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file
|
ipex_llm/libs/pipeline.dll
CHANGED
Binary file
|
ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file
|
ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file
|
ipex_llm/libs/starcoder.dll
CHANGED
Binary file
|
ipex_llm/transformers/convert.py
CHANGED
@@ -150,12 +150,13 @@ def is_linear_module(module):
|
|
150
150
|
if _VLLM_VERSION is None:
|
151
151
|
_VLLM_VERSION = get_package_version('vllm')
|
152
152
|
from vllm.model_executor.layers.linear import (
|
153
|
-
ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
|
153
|
+
ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
|
154
|
+
MergedColumnParallelLinear, ReplicatedLinear
|
154
155
|
)
|
155
156
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
156
157
|
VLLM_LINEAR_LIST = [
|
157
158
|
ColumnParallelLinear, RowParallelLinear, QKVParallelLinear,
|
158
|
-
MergedColumnParallelLinear,
|
159
|
+
MergedColumnParallelLinear, ReplicatedLinear,
|
159
160
|
]
|
160
161
|
if 'xpu' in _VLLM_VERSION:
|
161
162
|
VLLM_LINEAR_LIST.append(ParallelLMHead)
|
ipex_llm/vllm/xpu/engine/__init__.py
CHANGED
@@ -13,10 +13,12 @@
|
|
13
13
|
# See the License for the specific language governing permissions and
|
14
14
|
# limitations under the License.
|
15
15
|
#
|
16
|
-
from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
|
16
|
+
from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine, IPEXLLMAsyncV1Engine, IPEXLLMLLMV1Engine
|
17
17
|
__all__ = [
|
18
18
|
"IPEXLLMAsyncLLMEngine",
|
19
19
|
"IPEXLLMLLMEngine",
|
20
20
|
"IPEXLLMClass",
|
21
|
+
"IPEXLLMAsyncV1Engine",
|
22
|
+
"IPEXLLMLLMV1Engine",
|
21
23
|
"run_mp_engine",
|
22
24
|
]
|
ipex_llm/vllm/xpu/engine/engine.py
CHANGED
@@ -38,6 +38,8 @@ logger = init_logger(__name__)
|
|
38
38
|
|
39
39
|
|
40
40
|
class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
|
41
|
+
_is_converted = False
|
42
|
+
|
41
43
|
def __init__(self, *args, **kwargs):
|
42
44
|
super().__init__(*args, **kwargs)
|
43
45
|
|
@@ -53,13 +55,39 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
|
|
53
55
|
) -> "AsyncLLMEngine":
|
54
56
|
"""Creates an async LLM engine from the engine arguments."""
|
55
57
|
# Create the engine configs.
|
56
|
-
|
58
|
+
if not cls._is_converted:
|
59
|
+
_ipex_llm_convert(load_in_low_bit)
|
60
|
+
cls._is_converted = True
|
57
61
|
return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
|
58
62
|
start_engine_loop=start_engine_loop,
|
59
63
|
usage_context=usage_context, stat_loggers=stat_loggers)
|
60
64
|
|
65
|
+
@classmethod
|
66
|
+
def from_vllm_config(
|
67
|
+
cls,
|
68
|
+
vllm_config: VllmConfig,
|
69
|
+
start_engine_loop: bool = True,
|
70
|
+
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
71
|
+
stat_loggers: Optional[dict[str, StatLoggerBase]]=None,
|
72
|
+
disable_log_requests: bool = False,
|
73
|
+
disable_log_stats: bool = False,
|
74
|
+
load_in_low_bit: str = "sym_int4",
|
75
|
+
) -> "AsyncLLMEngine":
|
76
|
+
if not cls._is_converted:
|
77
|
+
_ipex_llm_convert(load_in_low_bit)
|
78
|
+
cls._is_converted = True
|
79
|
+
return super().from_vllm_config(
|
80
|
+
vllm_config=vllm_config,
|
81
|
+
start_engine_loop=start_engine_loop,
|
82
|
+
usage_context=usage_context,
|
83
|
+
stat_loggers=stat_loggers,
|
84
|
+
disable_log_requests=disable_log_requests,
|
85
|
+
disable_log_stats=disable_log_stats,
|
86
|
+
)
|
87
|
+
|
61
88
|
|
62
89
|
class IPEXLLMAsyncV1Engine(AsyncLLM):
|
90
|
+
_is_converted = False
|
63
91
|
|
64
92
|
def __init__(self, *args, **kwargs):
|
65
93
|
super().__init__(*args, **kwargs)
|
@@ -74,13 +102,39 @@ class IPEXLLMAsyncV1Engine(AsyncLLM):
|
|
74
102
|
load_in_low_bit: str = "sym_int4",
|
75
103
|
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None, # noqa
|
76
104
|
) -> "AsyncLLM":
|
77
|
-
|
105
|
+
if not cls._is_converted:
|
106
|
+
_ipex_llm_convert(load_in_low_bit)
|
107
|
+
cls._is_converted = True
|
78
108
|
return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
|
79
109
|
start_engine_loop=start_engine_loop,
|
80
110
|
usage_context=usage_context, stat_loggers=stat_loggers)
|
81
111
|
|
112
|
+
@classmethod
|
113
|
+
def from_vllm_config(
|
114
|
+
cls,
|
115
|
+
vllm_config: VllmConfig,
|
116
|
+
start_engine_loop: bool = True,
|
117
|
+
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
118
|
+
stat_loggers: Optional[dict[str, StatLoggerBase]]=None,
|
119
|
+
disable_log_requests: bool = False,
|
120
|
+
disable_log_stats: bool = False,
|
121
|
+
load_in_low_bit: str = "sym_int4",
|
122
|
+
) -> "AsyncLLM":
|
123
|
+
if not cls._is_converted:
|
124
|
+
_ipex_llm_convert(load_in_low_bit)
|
125
|
+
cls._is_converted = True
|
126
|
+
return super().from_vllm_config(
|
127
|
+
vllm_config=vllm_config,
|
128
|
+
start_engine_loop=start_engine_loop,
|
129
|
+
usage_context=usage_context,
|
130
|
+
stat_loggers=stat_loggers,
|
131
|
+
disable_log_requests=disable_log_requests,
|
132
|
+
disable_log_stats=disable_log_stats,
|
133
|
+
)
|
134
|
+
|
82
135
|
|
83
136
|
class IPEXLLMClass(LLM):
|
137
|
+
|
84
138
|
def __init__(
|
85
139
|
self,
|
86
140
|
model: str,
|
@@ -94,20 +148,20 @@ class IPEXLLMClass(LLM):
|
|
94
148
|
quantization: Optional[str] = None,
|
95
149
|
revision: Optional[str] = None,
|
96
150
|
tokenizer_revision: Optional[str] = None,
|
97
|
-
seed: int =
|
151
|
+
seed: Optional[int] = None,
|
98
152
|
gpu_memory_utilization: float = 0.9,
|
99
153
|
swap_space: float = 4,
|
100
154
|
cpu_offload_gb: float = 0,
|
101
155
|
enforce_eager: Optional[bool] = None,
|
102
156
|
max_seq_len_to_capture: int = 8192,
|
103
157
|
disable_custom_all_reduce: bool = False,
|
104
|
-
disable_async_output_proc: bool =
|
105
|
-
hf_overrides: Optional[HfOverrides]
|
106
|
-
mm_processor_kwargs: Optional[
|
158
|
+
disable_async_output_proc: bool = False,
|
159
|
+
hf_overrides: Optional[HfOverrides]=None,
|
160
|
+
mm_processor_kwargs: Optional[dict[str, Any]]=None,
|
107
161
|
# After positional args are removed, move this right below `model`
|
108
162
|
task: TaskOption = "auto",
|
109
163
|
override_pooler_config: Optional[PoolerConfig] = None,
|
110
|
-
compilation_config: Optional[Union[int,
|
164
|
+
compilation_config: Optional[Union[int, dict[str, Any]]]=None,
|
111
165
|
load_in_low_bit: str = "sym_int4",
|
112
166
|
**kwargs,
|
113
167
|
) -> None:
|
@@ -120,6 +174,13 @@ class IPEXLLMClass(LLM):
|
|
120
174
|
if "disable_log_stats" not in kwargs:
|
121
175
|
kwargs["disable_log_stats"] = True
|
122
176
|
|
177
|
+
if "worker_cls" in kwargs:
|
178
|
+
worker_cls = kwargs["worker_cls"]
|
179
|
+
# if the worker_cls is not qualified string name,
|
180
|
+
# we serialize it using cloudpickle to avoid pickling issues
|
181
|
+
if isinstance(worker_cls, type):
|
182
|
+
kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)
|
183
|
+
|
123
184
|
if compilation_config is not None:
|
124
185
|
if isinstance(compilation_config, (int, dict)):
|
125
186
|
compilation_config_instance = CompilationConfig.from_cli(
|
@@ -159,11 +220,13 @@ class IPEXLLMClass(LLM):
|
|
159
220
|
# Logic to switch between engines is done at runtime instead of import
|
160
221
|
# to avoid import order issues
|
161
222
|
self.engine_class = self.get_engine_class()
|
223
|
+
# print("!!! ", load_in_low_bit)
|
162
224
|
self.llm_engine = self.engine_class.from_engine_args(
|
163
225
|
engine_args, usage_context=UsageContext.LLM_CLASS,
|
164
226
|
load_in_low_bit=load_in_low_bit)
|
165
227
|
|
166
228
|
self.request_counter = Counter()
|
229
|
+
self.default_sampling_params: Union[dict[str, Any], None] = None
|
167
230
|
|
168
231
|
@staticmethod
|
169
232
|
def get_engine_class() -> Type[LLMEngine]:
|
@@ -173,6 +236,8 @@ class IPEXLLMClass(LLM):
|
|
173
236
|
|
174
237
|
|
175
238
|
class IPEXLLMLLMV1Engine(V1LLMEngine):
|
239
|
+
_is_converted = False
|
240
|
+
|
176
241
|
def __init__(self, *args, **kwargs):
|
177
242
|
super().__init__(*args, **kwargs)
|
178
243
|
|
@@ -188,14 +253,37 @@ class IPEXLLMLLMV1Engine(V1LLMEngine):
|
|
188
253
|
"""Creates an LLM engine from the engine arguments."""
|
189
254
|
# Create the engine configs.
|
190
255
|
|
191
|
-
|
256
|
+
if not cls._is_converted:
|
257
|
+
_ipex_llm_convert(load_in_low_bit)
|
258
|
+
cls._is_converted = True
|
192
259
|
return super().from_engine_args(engine_args,
|
193
260
|
usage_context,
|
194
261
|
stat_loggers,
|
195
262
|
enable_multiprocessing)
|
196
263
|
|
264
|
+
@classmethod
|
265
|
+
def from_vllm_config(
|
266
|
+
cls,
|
267
|
+
vllm_config: VllmConfig,
|
268
|
+
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
269
|
+
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
|
270
|
+
disable_log_stats: bool = False,
|
271
|
+
load_in_low_bit: str = "sym_int4",
|
272
|
+
) -> "LLMEngine":
|
273
|
+
if not cls._is_converted:
|
274
|
+
_ipex_llm_convert(load_in_low_bit)
|
275
|
+
cls._is_converted = True
|
276
|
+
return super().from_vllm_config(
|
277
|
+
vllm_config=vllm_config,
|
278
|
+
usage_context=usage_context,
|
279
|
+
stat_loggers=stat_loggers,
|
280
|
+
disable_log_stats=disable_log_stats
|
281
|
+
)
|
282
|
+
|
197
283
|
|
198
284
|
class IPEXLLMLLMEngine(LLMEngine):
|
285
|
+
_is_converted = False
|
286
|
+
|
199
287
|
def __init__(self, *args, **kwargs):
|
200
288
|
super().__init__(*args, **kwargs)
|
201
289
|
|
@@ -209,33 +297,89 @@ class IPEXLLMLLMEngine(LLMEngine):
|
|
209
297
|
) -> "LLMEngine":
|
210
298
|
"""Creates an LLM engine from the engine arguments."""
|
211
299
|
# Create the engine configs.
|
212
|
-
|
300
|
+
if not cls._is_converted:
|
301
|
+
_ipex_llm_convert(load_in_low_bit)
|
302
|
+
cls._is_converted = True
|
213
303
|
return super().from_engine_args(engine_args, usage_context, stat_loggers)
|
214
304
|
|
305
|
+
@classmethod
|
306
|
+
def from_vllm_config(
|
307
|
+
cls,
|
308
|
+
vllm_config: VllmConfig,
|
309
|
+
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
310
|
+
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
|
311
|
+
disable_log_stats: bool = False,
|
312
|
+
load_in_low_bit: str = "sym_int4",
|
313
|
+
) -> "LLMEngine":
|
314
|
+
if not cls._is_converted:
|
315
|
+
_ipex_llm_convert(load_in_low_bit)
|
316
|
+
cls._is_converted = True
|
317
|
+
return super().from_vllm_config(
|
318
|
+
vllm_config=vllm_config,
|
319
|
+
usage_context=usage_context,
|
320
|
+
stat_loggers=stat_loggers,
|
321
|
+
disable_log_stats=disable_log_stats
|
322
|
+
)
|
323
|
+
|
215
324
|
|
216
325
|
class IPEXLLMMQLLMEngine(MQLLMEngine):
|
326
|
+
_is_converted = False
|
327
|
+
|
328
|
+
def __init__(self, *args, **kwargs):
|
329
|
+
super().__init__(*args, **kwargs)
|
330
|
+
|
217
331
|
@classmethod
|
218
332
|
def from_engine_args(cls, engine_args: AsyncEngineArgs,
|
219
333
|
usage_context: UsageContext, ipc_path: str, load_in_low_bit: str):
|
220
|
-
|
334
|
+
if not cls._is_converted:
|
335
|
+
_ipex_llm_convert(load_in_low_bit)
|
336
|
+
cls._is_converted = True
|
221
337
|
return super().from_engine_args(engine_args, usage_context, ipc_path)
|
222
338
|
|
339
|
+
@classmethod
|
340
|
+
def from_vllm_config(cls, vllm_config: VllmConfig,
|
341
|
+
usage_context: UsageContext,
|
342
|
+
disable_log_requests: bool, disable_log_stats: bool,
|
343
|
+
ipc_path: str, load_in_low_bit: str) -> "MQLLMEngine":
|
344
|
+
|
345
|
+
if not cls._is_converted:
|
346
|
+
_ipex_llm_convert(load_in_low_bit)
|
347
|
+
cls._is_converted = True
|
348
|
+
return super().from_vllm_config(
|
349
|
+
vllm_config=vllm_config,
|
350
|
+
ipc_path=ipc_path,
|
351
|
+
usage_context=usage_context,
|
352
|
+
disable_log_requests=disable_log_requests,
|
353
|
+
disable_log_stats=disable_log_stats,
|
354
|
+
)
|
355
|
+
|
356
|
+
from vllm.transformers_utils.config import (
|
357
|
+
maybe_register_config_serialize_by_value)
|
358
|
+
|
223
359
|
|
224
|
-
def
|
225
|
-
|
360
|
+
def signal_handler(*_) -> None:
|
361
|
+
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa
|
226
362
|
|
227
|
-
def signal_handler(*_) -> None:
|
228
|
-
# Interrupt server on sigterm
|
229
|
-
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa
|
230
363
|
|
364
|
+
def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
|
365
|
+
ipc_path: str, disable_log_stats: bool,
|
366
|
+
disable_log_requests: bool, load_in_low_bit, engine_alive):
|
231
367
|
try:
|
368
|
+
# Ensure we can serialize transformer config before spawning
|
369
|
+
maybe_register_config_serialize_by_value()
|
370
|
+
|
371
|
+
engine = IPEXLLMMQLLMEngine.from_vllm_config(
|
372
|
+
vllm_config=vllm_config,
|
373
|
+
usage_context=usage_context,
|
374
|
+
disable_log_stats=disable_log_stats,
|
375
|
+
disable_log_requests=disable_log_requests,
|
376
|
+
load_in_low_bit=load_in_low_bit,
|
377
|
+
ipc_path=ipc_path)
|
378
|
+
|
232
379
|
signal.signal(signal.SIGTERM, signal_handler)
|
233
380
|
|
234
|
-
engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
|
235
|
-
usage_context=usage_context,
|
236
|
-
ipc_path=ipc_path,
|
237
|
-
load_in_low_bit=load_in_low_bit)
|
238
381
|
engine.start()
|
382
|
+
|
239
383
|
except BaseException as e:
|
240
384
|
logger.exception(e)
|
241
385
|
engine_alive.value = False
|