ipex-llm 2.2.0b20250211__py3-none-win_amd64.whl → 2.2.0b20250212__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +4 -3
- ipex_llm/transformers/models/janus.py +49 -0
- ipex_llm/transformers/models/utils.py +1 -1
- ipex_llm/vllm/xpu/engine/engine.py +117 -20
- ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
- ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
- ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
- ipex_llm/vllm/xpu/model_convert.py +25 -19
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/METADATA +19 -19
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/RECORD +45 -43
- {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/top_level.txt +0 -0
ipex_llm/vllm/xpu/entrypoints/openai/api_server.py

@@ -1,4 +1,5 @@
 import asyncio
+import atexit
 import importlib
 import inspect
 import multiprocessing
@@ -7,11 +8,12 @@ import re
 import signal
 import socket
 import tempfile
+import uuid
 from argparse import Namespace
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set, Tuple

 import uvloop
 from fastapi import APIRouter, FastAPI, Request
@@ -29,9 +31,13 @@ from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from ipex_llm.vllm.xpu.engine import run_mp_engine
 from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import load_chat_template
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
-from
+from vllm.entrypoints.openai.cli_args import (make_arg_parser,
+                                              validate_parsed_serve_args)
+
+# from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -41,8 +47,12 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest,
-                                              EmbeddingResponse,
+                                              EmbeddingResponse,
+                                              EmbeddingResponseData,
+                                              ErrorResponse,
                                               LoadLoraAdapterRequest,
+                                              PoolingRequest, PoolingResponse,
+                                              ScoreRequest, ScoreResponse,
                                               TokenizeRequest,
                                               TokenizeResponse,
                                               UnloadLoraAdapterRequest)
@@ -50,12 +60,20 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
+
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
+from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
+from vllm.entrypoints.openai.tool_parsers import ToolParserManager
+from vllm.entrypoints.utils import with_cancellation
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
+from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
+                        is_valid_ipv6_address, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION

 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -111,7 +129,7 @@ async def build_async_engine_client(
 async def build_async_engine_client_from_engine_args(
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
-    load_in_low_bit: str =
+    load_in_low_bit: str = "sym_int4",
 ) -> AsyncIterator[EngineClient]:
     """
     Create EngineClient, either:
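
With the new `load_in_low_bit` default, callers no longer have to pass a quantization format explicitly. A minimal usage sketch, assuming the function is used as an async context manager (consistent with the `asynccontextmanager` import shown above); the model name and engine arguments are placeholders, not taken from the diff:

```python
# Hedged sketch: model name and engine-arg values are placeholders.
import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs

from ipex_llm.vllm.xpu.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)


async def main() -> None:
    engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # placeholder model
    # load_in_low_bit now defaults to "sym_int4"; pass another IPEX-LLM
    # low-bit format here to override it.
    async with build_async_engine_client_from_engine_args(
            engine_args, load_in_low_bit="sym_int4") as engine_client:
        print(await engine_client.get_model_config())


asyncio.run(main())
```
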
@@ -124,25 +142,19 @@ async def build_async_engine_client_from_engine_args(
     # Fall back
     # TODO: fill out feature matrix.
     if (MQLLMEngineClient.is_unsupported_config(engine_args)
-            or disable_frontend_multiprocessing):
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            engine_client = await asyncio.get_running_loop().run_in_executor(
-                None, build_engine)
-
-        yield engine_client
-        return
+            or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
+        engine_client: Optional[EngineClient] = None
+        try:
+            # When starting this, we are actually starting with the V1Engine
+            # Here we are doing a classification, we will need to do this in IPEX-LLM
+            engine_client = AsyncLLMEngine.from_engine_args(
+                engine_args=engine_args,
+                usage_context=UsageContext.OPENAI_API_SERVER,
+                load_in_low_bit=load_in_low_bit)
+            yield engine_client
+        finally:
+            if engine_client and hasattr(engine_client, "shutdown"):
+                engine_client.shutdown()

     # Otherwise, use the multiprocessing AsyncLLMEngine.
     else:
@@ -163,45 +175,60 @@ async def build_async_engine_client_from_engine_args(

         # Select random path for IPC.
         ipc_path = get_open_zmq_ipc_path()
-        logger.
-
+        logger.debug("Multiprocessing frontend to use %s for IPC Path.",
+                     ipc_path)

         # Start RPCServer in separate process (holds the LLMEngine).
         # the current process might have CUDA context,
         # so we need to spawn a new process
         context = multiprocessing.get_context("spawn")

+        # The Process can raise an exception during startup, which may
+        # not actually result in an exitcode being reported. As a result
+        # we use a shared variable to communicate the information.
+        engine_alive = multiprocessing.Value('b', True, lock=False)
         engine_process = context.Process(target=run_mp_engine,
                                          args=(engine_args,
                                                UsageContext.OPENAI_API_SERVER,
-                                               ipc_path,
-                                               load_in_low_bit))
+                                               ipc_path, load_in_low_bit, engine_alive))
         engine_process.start()
-
+        engine_pid = engine_process.pid
+        assert engine_pid is not None, "Engine process failed to start."
+        logger.info("Started engine process with PID %d", engine_pid)
+
+        def _cleanup_ipc_path():
+            socket_path = ipc_path.replace("ipc://", "")
+            if os.path.exists(socket_path):
+                os.remove(socket_path)
+
+        # Ensure we clean up the local IPC socket file on exit.
+        atexit.register(_cleanup_ipc_path)

         # Build RPCClient, which conforms to EngineClient Protocol.
-        # NOTE: Actually, this is not true yet. We still need to support
-        # embedding models via RPC (see TODO above)
         engine_config = engine_args.create_engine_config()
-
-
+        build_client = partial(MQLLMEngineClient, ipc_path, engine_config,
+                               engine_pid)
+        mq_engine_client = await asyncio.get_running_loop().run_in_executor(
+            None, build_client)
         try:
             while True:
                 try:
-                    await
+                    await mq_engine_client.setup()
                     break
                 except TimeoutError:
-                    if not engine_process.is_alive()
+                    if (not engine_process.is_alive()
+                            or not engine_alive.value):
                         raise RuntimeError(
-                            "Engine process failed to start
+                            "Engine process failed to start. See stack "
+                            "trace for the root cause.") from None

-            yield
+            yield mq_engine_client  # type: ignore[misc]
         finally:
             # Ensure rpc server process was terminated
             engine_process.terminate()

             # Close all open connections to the backend
-
+            mq_engine_client.close()

             # Wait for engine process to join
             engine_process.join(4)
@@ -230,8 +257,8 @@ def mount_metrics(app: FastAPI):

     prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
     if prometheus_multiproc_dir_path is not None:
-        logger.
-
+        logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
+                     prometheus_multiproc_dir_path)
         registry = CollectorRegistry()
         multiprocess.MultiProcessCollector(registry)

@@ -246,22 +273,35 @@ def mount_metrics(app: FastAPI):
     app.routes.append(metrics_route)


-def
+def base(request: Request) -> OpenAIServing:
+    # Reuse the existing instance
+    return tokenization(request)
+
+
+def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat


-def completion(request: Request) -> OpenAIServingCompletion:
+def completion(request: Request) -> Optional[OpenAIServingCompletion]:
     return request.app.state.openai_serving_completion


-def
-    return request.app.state.
+def pooling(request: Request) -> Optional[OpenAIServingPooling]:
+    return request.app.state.openai_serving_pooling


-def embedding(request: Request) -> OpenAIServingEmbedding:
+def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
     return request.app.state.openai_serving_embedding


+def score(request: Request) -> Optional[OpenAIServingScores]:
+    return request.app.state.openai_serving_scores
+
+
+def tokenization(request: Request) -> OpenAIServingTokenization:
+    return request.app.state.openai_serving_tokenization
+
+
 def engine_client(request: Request) -> EngineClient:
     return request.app.state.engine_client

@@ -274,8 +314,11 @@ async def health(raw_request: Request) -> Response:


 @router.post("/tokenize")
+@with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
-
+    handler = tokenization(raw_request)
+
+    generator = await handler.create_tokenize(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -286,8 +329,11 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):


 @router.post("/detokenize")
+@with_cancellation
 async def detokenize(request: DetokenizeRequest, raw_request: Request):
-
+    handler = tokenization(raw_request)
+
+    generator = await handler.create_detokenize(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -299,7 +345,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):

 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
-
+    handler = base(raw_request)
+
+    models = await handler.show_available_models()
     return JSONResponse(content=models.model_dump())


@@ -310,11 +358,15 @@ async def show_version():


 @router.post("/v1/chat/completions")
+@with_cancellation
 async def create_chat_completion(request: ChatCompletionRequest,
                                  raw_request: Request):
+    handler = chat(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Chat Completions API")

-    generator = await
-        request, raw_request)
+    generator = await handler.create_chat_completion(request, raw_request)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
@@ -327,9 +379,14 @@ async def create_chat_completion(request: ChatCompletionRequest,


 @router.post("/v1/completions")
+@with_cancellation
 async def create_completion(request: CompletionRequest, raw_request: Request):
-
-
+    handler = completion(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Completions API")
+
+    generator = await handler.create_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -340,9 +397,40 @@ async def create_completion(request: CompletionRequest, raw_request: Request):


 @router.post("/v1/embeddings")
+@with_cancellation
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
-
-
+    handler = embedding(raw_request)
+    if handler is None:
+        fallback_handler = pooling(raw_request)
+        if fallback_handler is None:
+            return base(raw_request).create_error_response(
+                message="The model does not support Embeddings API")
+
+        logger.warning(
+            "Embeddings API will become exclusive to embedding models "
+            "in a future release. To return the hidden states directly, "
+            "use the Pooling API (`/pooling`) instead.")
+
+        res = await fallback_handler.create_pooling(request, raw_request)
+        if isinstance(res, PoolingResponse):
+            generator = EmbeddingResponse(
+                id=res.id,
+                object=res.object,
+                created=res.created,
+                model=res.model,
+                data=[
+                    EmbeddingResponseData(
+                        index=d.index,
+                        embedding=d.data,  # type: ignore
+                    ) for d in res.data
+                ],
+                usage=res.usage,
+            )
+        else:
+            generator = res
+    else:
+        generator = await handler.create_embedding(request, raw_request)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
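
Because `/v1/embeddings` now falls back to the Pooling handler when a model only exposes pooling, an existing OpenAI-style client call keeps working. A hedged client-side sketch; the base URL, API key, and model name are assumptions rather than values from the diff, and it requires a running server:

```python
# Hedged sketch: endpoint URL, API key, and model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.embeddings.create(
    model="my-embedding-model",          # placeholder served model name
    input=["IPEX-LLM low-bit serving"],  # arbitrary example input
)
print(len(resp.data[0].embedding))       # dimensionality of the returned vector
```
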
@@ -352,6 +440,52 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     assert_never(generator)


+@router.post("/pooling")
+@with_cancellation
+async def create_pooling(request: PoolingRequest, raw_request: Request):
+    handler = pooling(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Pooling API")
+
+    generator = await handler.create_pooling(request, raw_request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    elif isinstance(generator, PoolingResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post("/score")
+@with_cancellation
+async def create_score(request: ScoreRequest, raw_request: Request):
+    handler = score(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Score API")
+
+    generator = await handler.create_score(request, raw_request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    elif isinstance(generator, ScoreResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post("/v1/score")
+@with_cancellation
+async def create_score_v1(request: ScoreRequest, raw_request: Request):
+    logger.warning(
+        "To indicate that Score API is not part of standard OpenAI API, we "
+        "have moved it to `/score`. Please update your client accordingly.")
+
+    return await create_score(request, raw_request)
+
+
 if envs.VLLM_TORCH_PROFILER_DIR:
     logger.warning(
         "Torch Profiler is enabled in the API server. This should ONLY be "
@@ -380,30 +514,26 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
-
-
-
-
-
-
-
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+        for route in [chat, completion, embedding]:
+            handler = route(raw_request)
+            if handler is not None:
+                response = await handler.load_lora_adapter(request)
+                if isinstance(response, ErrorResponse):
+                    return JSONResponse(content=response.model_dump(),
+                                        status_code=response.code)

         return Response(status_code=200, content=response)

     @router.post("/v1/unload_lora_adapter")
     async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                                   raw_request: Request):
-
-
-
-
-
-
-
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+        for route in [chat, completion, embedding]:
+            handler = route(raw_request)
+            if handler is not None:
+                response = await handler.unload_lora_adapter(request)
+                if isinstance(response, ErrorResponse):
+                    return JSONResponse(content=response.model_dump(),
+                                        status_code=response.code)

         return Response(status_code=200, content=response)

@@ -431,8 +561,9 @@ def build_app(args: Namespace) -> FastAPI:

     @app.exception_handler(RequestValidationError)
     async def validation_exception_handler(_, exc):
-
-
+        err = ErrorResponse(message=str(exc),
+                            type="BadRequestError",
+                            code=HTTPStatus.BAD_REQUEST)
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)

@@ -440,16 +571,31 @@ def build_app(args: Namespace) -> FastAPI:

     @app.middleware("http")
     async def authentication(request: Request, call_next):
-        root_path = "" if args.root_path is None else args.root_path
         if request.method == "OPTIONS":
             return await call_next(request)
-
+        url_path = request.url.path
+        if app.root_path and url_path.startswith(app.root_path):
+            url_path = url_path[len(app.root_path):]
+        if not url_path.startswith("/v1"):
             return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + token:
             return JSONResponse(content={"error": "Unauthorized"},
                                 status_code=401)
         return await call_next(request)

+    if args.enable_request_id_headers:
+        logger.warning(
+            "CAUTION: Enabling X-Request-Id headers in the API Server. "
+            "This can harm performance at high QPS.")
+
+        @app.middleware("http")
+        async def add_request_id(request: Request, call_next):
+            request_id = request.headers.get(
+                "X-Request-Id") or uuid.uuid4().hex
+            response = await call_next(request)
+            response.headers["X-Request-Id"] = request_id
+            return response
+
     for middleware in args.middleware:
         module_path, object_name = middleware.rsplit(".", 1)
         imported = getattr(importlib.import_module(module_path), object_name)
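
The new middleware echoes an incoming `X-Request-Id` header on the response, or fills in a fresh `uuid4` hex string when the client did not send one. A small client-side sketch; the host, port, and the precondition that the server was started with request-id headers enabled are assumptions, not from the diff:

```python
# Hedged sketch: host/port and a running server with request-id headers enabled are assumed.
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/health",
    headers={"X-Request-Id": "my-trace-id"},
)
with urllib.request.urlopen(req) as resp:
    # The middleware above copies the incoming header back onto the response.
    print(resp.headers.get("X-Request-Id"))
```
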
@@ -488,49 +634,179 @@ def init_app_state(
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats

+    resolved_chat_template = load_chat_template(args.chat_template)
+    logger.info("Using supplied chat template:\n%s", resolved_chat_template)
+
+    state.openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+    )
+    # TODO: The chat template is now broken for lora adapters :(
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-
+        state.openai_serving_models,
         args.response_role,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
-        chat_template=
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
-        tool_parser=args.tool_call_parser
+        tool_parser=args.tool_call_parser,
+        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+    ) if model_config.runner_type == "generate" else None
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
+        state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-    )
+    ) if model_config.runner_type == "generate" else None
+    state.openai_serving_pooling = OpenAIServingPooling(
+        engine_client,
+        model_config,
+        state.openai_serving_models,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+    ) if model_config.runner_type == "pooling" else None
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
-
+        state.openai_serving_models,
         request_logger=request_logger,
-
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+    ) if model_config.task == "embed" else None
+    state.openai_serving_scores = OpenAIServingScores(
+        engine_client,
+        model_config,
+        state.openai_serving_models,
+        request_logger=request_logger
+    ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
-
-        lora_modules=args.lora_modules,
+        state.openai_serving_models,
         request_logger=request_logger,
-        chat_template=
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
     )
+    state.task = model_config.task
+    # if args.served_model_name is not None:
+    #     served_model_names = args.served_model_name
+    # else:
+    #     served_model_names = [args.model]
+
+    # if args.disable_log_requests:
+    #     request_logger = None
+    # else:
+    #     request_logger = RequestLogger(max_log_len=args.max_log_len)
+
+    # base_model_paths = [
+    #     BaseModelPath(name=name, model_path=args.model)
+    #     for name in served_model_names
+    # ]
+
+    # state.engine_client = engine_client
+    # state.log_stats = not args.disable_log_stats
+
+    # resolved_chat_template = load_chat_template(args.chat_template)
+    # logger.info("Using supplied chat template:\n%s", resolved_chat_template)
+
+    # state.openai_serving_chat = OpenAIServingChat(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     args.response_role,
+    #     lora_modules=args.lora_modules,
+    #     prompt_adapters=args.prompt_adapters,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    #     return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+    #     enable_auto_tools=args.enable_auto_tool_choice,
+    #     tool_parser=args.tool_call_parser,
+    #     enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+    # ) if model_config.runner_type == "generate" else None
+    # state.openai_serving_completion = OpenAIServingCompletion(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     lora_modules=args.lora_modules,
+    #     prompt_adapters=args.prompt_adapters,
+    #     request_logger=request_logger,
+    #     return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+    # ) if model_config.runner_type == "generate" else None
+    # state.openai_serving_pooling = OpenAIServingPooling(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    # ) if model_config.runner_type == "pooling" else None
+    # state.openai_serving_embedding = OpenAIServingEmbedding(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    # ) if model_config.task == "embed" else None
+    # state.openai_serving_scores = OpenAIServingScores(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     request_logger=request_logger
+    # ) if model_config.task == "score" else None
+    # state.openai_serving_tokenization = OpenAIServingTokenization(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     lora_modules=args.lora_modules,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    # )
+
+
+def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
+    family = socket.AF_INET
+    if is_valid_ipv6_address(addr[0]):
+        family = socket.AF_INET6
+
+    sock = socket.socket(family=family, type=socket.SOCK_STREAM)
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    sock.bind(addr)
+
+    return sock


 async def run_server(args, **uvicorn_kwargs) -> None:
     logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)

-
-
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
+
+    valide_tool_parses = ToolParserManager.tool_parsers.keys()
+    if args.enable_auto_tool_choice \
+            and args.tool_call_parser not in valide_tool_parses:
+        raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
+                       f"(chose from {{ {','.join(valide_tool_parses)} }})")
+
+    # workaround to make sure that we bind the port before the engine is set up.
+    # This avoids race conditions with ray.
+    # see https://github.com/vllm-project/vllm/issues/8204
+    sock_addr = (args.host or "", args.port)
+    sock = create_server_socket(sock_addr)
+
+    # workaround to avoid footguns where uvicorn drops requests with too
+    # many concurrent requests active
+    set_ulimit()

     def signal_handler(*_) -> None:
         # Interrupt server on sigterm while initializing
@@ -544,8 +820,6 @@ async def run_server(args, **uvicorn_kwargs) -> None:
         model_config = await engine_client.get_model_config()
         init_app_state(engine_client, model_config, app.state, args)

-        temp_socket.close()
-
         shutdown_task = await serve_http(
             app,
             host=args.host,
@@ -562,13 +836,23 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     # NB: Await server shutdown only after the backend context is exited
     await shutdown_task

+    sock.close()
+

 if __name__ == "__main__":
     # NOTE(simon):
     # This section should be in sync with vllm/scripts.py for CLI entrypoints.
+    logger.warning("Warning: Please use `ipex_llm.vllm.xpu.entrypoints.openai.api_server` "
+                   "instead of `vllm.entrypoints.openai.api_server` to start the API server")
     parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)
+    parser.add_argument(
+        "--load-in-low-bit",
+        type=str,
+        default="sym_int4",
+        help="Low-bit quantization for IPEX-LLM models")
     args = parser.parse_args()
+    validate_parsed_serve_args(args)

     uvloop.run(run_server(args))
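
For reference, the `__main__` block above now registers the IPEX-LLM-specific `--load-in-low-bit` option alongside the standard vLLM arguments; the usual way to start the server is `python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server ...`, as the warning in the diff suggests. A minimal sketch that mirrors the same block programmatically; the model name is a placeholder, not from the diff:

```python
# Hedged sketch mirroring the __main__ block above; the model name is a placeholder.
import uvloop
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                              validate_parsed_serve_args)
from vllm.utils import FlexibleArgumentParser

from ipex_llm.vllm.xpu.entrypoints.openai.api_server import run_server

parser = FlexibleArgumentParser(
    description="vLLM OpenAI-Compatible RESTful API server.")
parser = make_arg_parser(parser)
parser.add_argument("--load-in-low-bit", type=str, default="sym_int4",
                    help="Low-bit quantization for IPEX-LLM models")
args = parser.parse_args(["--model", "Qwen/Qwen2.5-7B-Instruct",  # placeholder model
                          "--load-in-low-bit", "sym_int4"])
validate_parsed_serve_args(args)
uvloop.run(run_server(args))
```
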