ipex-llm 2.2.0b20250211__py3-none-win_amd64.whl → 2.2.0b20250212__py3-none-win_amd64.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
Files changed (45)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +4 -3
  31. ipex_llm/transformers/models/janus.py +49 -0
  32. ipex_llm/transformers/models/utils.py +1 -1
  33. ipex_llm/vllm/xpu/engine/engine.py +117 -20
  34. ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
  35. ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
  36. ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
  37. ipex_llm/vllm/xpu/model_convert.py +25 -19
  38. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/METADATA +19 -19
  39. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/RECORD +45 -43
  40. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/ipex-llm-init.bat +0 -0
  41. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-chat.ps1 +0 -0
  42. {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-cli.ps1 +0 -0
  43. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/WHEEL +0 -0
  44. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/entry_points.txt +0 -0
  45. {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/top_level.txt +0 -0
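
The diff below is for ipex_llm/vllm/xpu/entrypoints/openai/api_server.py (item 34 in the list above).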
@@ -1,4 +1,5 @@
 import asyncio
+import atexit
 import importlib
 import inspect
 import multiprocessing
@@ -7,11 +8,12 @@ import re
 import signal
 import socket
 import tempfile
+import uuid
 from argparse import Namespace
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set, Tuple

 import uvloop
 from fastapi import APIRouter, FastAPI, Request
@@ -29,9 +31,13 @@ from ipex_llm.vllm.xpu.engine import IPEXLLMAsyncLLMEngine as AsyncLLMEngine
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from ipex_llm.vllm.xpu.engine import run_mp_engine
 from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.chat_utils import load_chat_template
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
-from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.openai.cli_args import (make_arg_parser,
+                                              validate_parsed_serve_args)
+
+# from ipex_llm.vllm.xpu.entrypoints.openai.cli_args import make_arg_parser
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -41,8 +47,12 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DetokenizeRequest,
                                               DetokenizeResponse,
                                               EmbeddingRequest,
-                                              EmbeddingResponse, ErrorResponse,
+                                              EmbeddingResponse,
+                                              EmbeddingResponseData,
+                                              ErrorResponse,
                                               LoadLoraAdapterRequest,
+                                              PoolingRequest, PoolingResponse,
+                                              ScoreRequest, ScoreResponse,
                                               TokenizeRequest,
                                               TokenizeResponse,
                                               UnloadLoraAdapterRequest)
@@ -50,12 +60,20 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import (BaseModelPath,
+                                                    OpenAIServingModels)
+
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
+from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
+from vllm.entrypoints.openai.tool_parsers import ToolParserManager
+from vllm.entrypoints.utils import with_cancellation
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
+from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
+                        is_valid_ipv6_address, set_ulimit)
 from vllm.version import __version__ as VLLM_VERSION

 TIMEOUT_KEEP_ALIVE = 5  # seconds
@@ -111,7 +129,7 @@ async def build_async_engine_client(
 async def build_async_engine_client_from_engine_args(
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
-    load_in_low_bit: str = 'sym_int4',
+    load_in_low_bit: str = "sym_int4",
 ) -> AsyncIterator[EngineClient]:
     """
     Create EngineClient, either:
@@ -124,25 +142,19 @@ async def build_async_engine_client_from_engine_args(
     # Fall back
     # TODO: fill out feature matrix.
     if (MQLLMEngineClient.is_unsupported_config(engine_args)
-            or disable_frontend_multiprocessing):
-        engine_config = engine_args.create_engine_config()
-        uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
-                           "uses_ray", False)
-
-        build_engine = partial(AsyncLLMEngine.from_engine_args,
-                               engine_args=engine_args,
-                               load_in_low_bit=load_in_low_bit,
-                               engine_config=engine_config,
-                               usage_context=UsageContext.OPENAI_API_SERVER)
-        if uses_ray:
-            # Must run in main thread with ray for its signal handlers to work
-            engine_client = build_engine()
-        else:
-            engine_client = await asyncio.get_running_loop().run_in_executor(
-                None, build_engine)
-
-        yield engine_client
-        return
+            or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
+        engine_client: Optional[EngineClient] = None
+        try:
+            # When starting this, we are actually starting with the V1Engine
+            # Here we are doing a classification, we will need to do this in IPEX-LLM
+            engine_client = AsyncLLMEngine.from_engine_args(
+                engine_args=engine_args,
+                usage_context=UsageContext.OPENAI_API_SERVER,
+                load_in_low_bit=load_in_low_bit)
+            yield engine_client
+        finally:
+            if engine_client and hasattr(engine_client, "shutdown"):
+                engine_client.shutdown()

     # Otherwise, use the multiprocessing AsyncLLMEngine.
     else:
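
When frontend multiprocessing is disabled (or the V1 engine is selected), the branch above now builds the in-process IPEXLLMAsyncLLMEngine directly and threads load_in_low_bit through from_engine_args. A minimal consumption sketch, assuming the function keeps its upstream @asynccontextmanager wrapper; the model id is an illustrative placeholder:

```python
# Hedged sketch (not part of the diff): drive the context manager above directly.
import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs


async def main():
    engine_args = AsyncEngineArgs(model="Qwen/Qwen2-7B-Instruct")  # assumed model id
    async with build_async_engine_client_from_engine_args(
            engine_args,
            disable_frontend_multiprocessing=True,  # take the in-process branch
            load_in_low_bit="sym_int4") as engine_client:
        # EngineClient protocol: query the resolved model config.
        print(await engine_client.get_model_config())


asyncio.run(main())
```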
@@ -163,45 +175,60 @@

         # Select random path for IPC.
         ipc_path = get_open_zmq_ipc_path()
-        logger.info("Multiprocessing frontend to use %s for IPC Path.",
-                    ipc_path)
+        logger.debug("Multiprocessing frontend to use %s for IPC Path.",
+                     ipc_path)

         # Start RPCServer in separate process (holds the LLMEngine).
         # the current process might have CUDA context,
         # so we need to spawn a new process
         context = multiprocessing.get_context("spawn")

+        # The Process can raise an exception during startup, which may
+        # not actually result in an exitcode being reported. As a result
+        # we use a shared variable to communicate the information.
+        engine_alive = multiprocessing.Value('b', True, lock=False)
         engine_process = context.Process(target=run_mp_engine,
                                          args=(engine_args,
                                                UsageContext.OPENAI_API_SERVER,
-                                               ipc_path,
-                                               load_in_low_bit))
+                                               ipc_path, load_in_low_bit, engine_alive))
         engine_process.start()
-        logger.info("Started engine process with PID %d", engine_process.pid)
+        engine_pid = engine_process.pid
+        assert engine_pid is not None, "Engine process failed to start."
+        logger.info("Started engine process with PID %d", engine_pid)
+
+        def _cleanup_ipc_path():
+            socket_path = ipc_path.replace("ipc://", "")
+            if os.path.exists(socket_path):
+                os.remove(socket_path)
+
+        # Ensure we clean up the local IPC socket file on exit.
+        atexit.register(_cleanup_ipc_path)

         # Build RPCClient, which conforms to EngineClient Protocol.
-        # NOTE: Actually, this is not true yet. We still need to support
-        # embedding models via RPC (see TODO above)
         engine_config = engine_args.create_engine_config()
-        mp_engine_client = MQLLMEngineClient(ipc_path, engine_config)
-
+        build_client = partial(MQLLMEngineClient, ipc_path, engine_config,
+                               engine_pid)
+        mq_engine_client = await asyncio.get_running_loop().run_in_executor(
+            None, build_client)
         try:
             while True:
                 try:
-                    await mp_engine_client.setup()
+                    await mq_engine_client.setup()
                     break
                 except TimeoutError:
-                    if not engine_process.is_alive():
+                    if (not engine_process.is_alive()
+                            or not engine_alive.value):
                         raise RuntimeError(
-                            "Engine process failed to start") from None
+                            "Engine process failed to start. See stack "
+                            "trace for the root cause.") from None

-            yield mp_engine_client  # type: ignore[misc]
+            yield mq_engine_client  # type: ignore[misc]
         finally:
             # Ensure rpc server process was terminated
             engine_process.terminate()

             # Close all open connections to the backend
-            mp_engine_client.close()
+            mq_engine_client.close()

             # Wait for engine process to join
             engine_process.join(4)
@@ -230,8 +257,8 @@ def mount_metrics(app: FastAPI):

     prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
     if prometheus_multiproc_dir_path is not None:
-        logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
-                    prometheus_multiproc_dir_path)
+        logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
+                     prometheus_multiproc_dir_path)
         registry = CollectorRegistry()
         multiprocess.MultiProcessCollector(registry)

@@ -246,22 +273,35 @@ def mount_metrics(app: FastAPI):
     app.routes.append(metrics_route)


-def chat(request: Request) -> OpenAIServingChat:
+def base(request: Request) -> OpenAIServing:
+    # Reuse the existing instance
+    return tokenization(request)
+
+
+def chat(request: Request) -> Optional[OpenAIServingChat]:
     return request.app.state.openai_serving_chat


-def completion(request: Request) -> OpenAIServingCompletion:
+def completion(request: Request) -> Optional[OpenAIServingCompletion]:
     return request.app.state.openai_serving_completion


-def tokenization(request: Request) -> OpenAIServingTokenization:
-    return request.app.state.openai_serving_tokenization
+def pooling(request: Request) -> Optional[OpenAIServingPooling]:
+    return request.app.state.openai_serving_pooling


-def embedding(request: Request) -> OpenAIServingEmbedding:
+def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
     return request.app.state.openai_serving_embedding


+def score(request: Request) -> Optional[OpenAIServingScores]:
+    return request.app.state.openai_serving_scores
+
+
+def tokenization(request: Request) -> OpenAIServingTokenization:
+    return request.app.state.openai_serving_tokenization
+
+
 def engine_client(request: Request) -> EngineClient:
     return request.app.state.engine_client

@@ -274,8 +314,11 @@ async def health(raw_request: Request) -> Response:


 @router.post("/tokenize")
+@with_cancellation
 async def tokenize(request: TokenizeRequest, raw_request: Request):
-    generator = await tokenization(raw_request).create_tokenize(request)
+    handler = tokenization(raw_request)
+
+    generator = await handler.create_tokenize(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -286,8 +329,11 @@ async def tokenize(request: TokenizeRequest, raw_request: Request):


 @router.post("/detokenize")
+@with_cancellation
 async def detokenize(request: DetokenizeRequest, raw_request: Request):
-    generator = await tokenization(raw_request).create_detokenize(request)
+    handler = tokenization(raw_request)
+
+    generator = await handler.create_detokenize(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -299,7 +345,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):

 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
-    models = await completion(raw_request).show_available_models()
+    handler = base(raw_request)
+
+    models = await handler.show_available_models()
     return JSONResponse(content=models.model_dump())


@@ -310,11 +358,15 @@ async def show_version():


 @router.post("/v1/chat/completions")
+@with_cancellation
 async def create_chat_completion(request: ChatCompletionRequest,
                                  raw_request: Request):
+    handler = chat(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Chat Completions API")

-    generator = await chat(raw_request).create_chat_completion(
-        request, raw_request)
+    generator = await handler.create_chat_completion(request, raw_request)

     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
@@ -327,9 +379,14 @@ async def create_chat_completion(request: ChatCompletionRequest,


 @router.post("/v1/completions")
+@with_cancellation
 async def create_completion(request: CompletionRequest, raw_request: Request):
-    generator = await completion(raw_request).create_completion(
-        request, raw_request)
+    handler = completion(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Completions API")
+
+    generator = await handler.create_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
@@ -340,9 +397,40 @@ async def create_completion(request: CompletionRequest, raw_request: Request):


 @router.post("/v1/embeddings")
+@with_cancellation
 async def create_embedding(request: EmbeddingRequest, raw_request: Request):
-    generator = await embedding(raw_request).create_embedding(
-        request, raw_request)
+    handler = embedding(raw_request)
+    if handler is None:
+        fallback_handler = pooling(raw_request)
+        if fallback_handler is None:
+            return base(raw_request).create_error_response(
+                message="The model does not support Embeddings API")
+
+        logger.warning(
+            "Embeddings API will become exclusive to embedding models "
+            "in a future release. To return the hidden states directly, "
+            "use the Pooling API (`/pooling`) instead.")
+
+        res = await fallback_handler.create_pooling(request, raw_request)
+        if isinstance(res, PoolingResponse):
+            generator = EmbeddingResponse(
+                id=res.id,
+                object=res.object,
+                created=res.created,
+                model=res.model,
+                data=[
+                    EmbeddingResponseData(
+                        index=d.index,
+                        embedding=d.data,  # type: ignore
+                    ) for d in res.data
+                ],
+                usage=res.usage,
+            )
+        else:
+            generator = res
+    else:
+        generator = await handler.create_embedding(request, raw_request)
+
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
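
The embeddings route above now falls back to the pooling handler and repackages a PoolingResponse as an EmbeddingResponse. A hedged client-side sketch; host, port, and served model name are assumptions:

```python
# Hedged sketch: OpenAI-style EmbeddingRequest body against a local server.
import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={"model": "my-embedding-model", "input": ["hello world"]},  # assumed model name
)
resp.raise_for_status()
print(len(resp.json()["data"][0]["embedding"]))  # length of the returned vector
```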
@@ -352,6 +440,52 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     assert_never(generator)


+@router.post("/pooling")
+@with_cancellation
+async def create_pooling(request: PoolingRequest, raw_request: Request):
+    handler = pooling(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Pooling API")
+
+    generator = await handler.create_pooling(request, raw_request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    elif isinstance(generator, PoolingResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post("/score")
+@with_cancellation
+async def create_score(request: ScoreRequest, raw_request: Request):
+    handler = score(raw_request)
+    if handler is None:
+        return base(raw_request).create_error_response(
+            message="The model does not support Score API")
+
+    generator = await handler.create_score(request, raw_request)
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(content=generator.model_dump(),
+                            status_code=generator.code)
+    elif isinstance(generator, ScoreResponse):
+        return JSONResponse(content=generator.model_dump())
+
+    assert_never(generator)
+
+
+@router.post("/v1/score")
+@with_cancellation
+async def create_score_v1(request: ScoreRequest, raw_request: Request):
+    logger.warning(
+        "To indicate that Score API is not part of standard OpenAI API, we "
+        "have moved it to `/score`. Please update your client accordingly.")
+
+    return await create_score(request, raw_request)
+
+
 if envs.VLLM_TORCH_PROFILER_DIR:
     logger.warning(
         "Torch Profiler is enabled in the API server. This should ONLY be "
@@ -380,30 +514,26 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
     @router.post("/v1/load_lora_adapter")
     async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                 raw_request: Request):
-        response = await chat(raw_request).load_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
-
-        response = await completion(raw_request).load_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+        for route in [chat, completion, embedding]:
+            handler = route(raw_request)
+            if handler is not None:
+                response = await handler.load_lora_adapter(request)
+                if isinstance(response, ErrorResponse):
+                    return JSONResponse(content=response.model_dump(),
+                                        status_code=response.code)

         return Response(status_code=200, content=response)

     @router.post("/v1/unload_lora_adapter")
     async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                                   raw_request: Request):
-        response = await chat(raw_request).unload_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
-
-        response = await completion(raw_request).unload_lora_adapter(request)
-        if isinstance(response, ErrorResponse):
-            return JSONResponse(content=response.model_dump(),
-                                status_code=response.code)
+        for route in [chat, completion, embedding]:
+            handler = route(raw_request)
+            if handler is not None:
+                response = await handler.unload_lora_adapter(request)
+                if isinstance(response, ErrorResponse):
+                    return JSONResponse(content=response.model_dump(),
+                                        status_code=response.code)

         return Response(status_code=200, content=response)

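These runtime LoRA routes are only registered when VLLM_ALLOW_RUNTIME_LORA_UPDATING is set, and they now iterate over whichever of the chat, completion, and embedding handlers exist. A hedged sketch of loading an adapter; the adapter name and path are placeholders:

```python
# Hedged sketch: LoadLoraAdapterRequest takes a lora_name and lora_path in
# upstream vLLM; the values below are placeholders.
import requests

resp = requests.post(
    "http://localhost:8000/v1/load_lora_adapter",
    json={"lora_name": "my_adapter", "lora_path": "/models/loras/my_adapter"},
)
print(resp.status_code, resp.text)
```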
@@ -431,8 +561,9 @@ def build_app(args: Namespace) -> FastAPI:

     @app.exception_handler(RequestValidationError)
     async def validation_exception_handler(_, exc):
-        chat = app.state.openai_serving_chat
-        err = chat.create_error_response(message=str(exc))
+        err = ErrorResponse(message=str(exc),
+                            type="BadRequestError",
+                            code=HTTPStatus.BAD_REQUEST)
         return JSONResponse(err.model_dump(),
                             status_code=HTTPStatus.BAD_REQUEST)

@@ -440,16 +571,31 @@ def build_app(args: Namespace) -> FastAPI:

         @app.middleware("http")
         async def authentication(request: Request, call_next):
-            root_path = "" if args.root_path is None else args.root_path
             if request.method == "OPTIONS":
                 return await call_next(request)
-            if not request.url.path.startswith(f"{root_path}/v1"):
+            url_path = request.url.path
+            if app.root_path and url_path.startswith(app.root_path):
+                url_path = url_path[len(app.root_path):]
+            if not url_path.startswith("/v1"):
                 return await call_next(request)
             if request.headers.get("Authorization") != "Bearer " + token:
                 return JSONResponse(content={"error": "Unauthorized"},
                                     status_code=401)
             return await call_next(request)

+    if args.enable_request_id_headers:
+        logger.warning(
+            "CAUTION: Enabling X-Request-Id headers in the API Server. "
+            "This can harm performance at high QPS.")
+
+        @app.middleware("http")
+        async def add_request_id(request: Request, call_next):
+            request_id = request.headers.get(
+                "X-Request-Id") or uuid.uuid4().hex
+            response = await call_next(request)
+            response.headers["X-Request-Id"] = request_id
+            return response
+
     for middleware in args.middleware:
         module_path, object_name = middleware.rsplit(".", 1)
         imported = getattr(importlib.import_module(module_path), object_name)
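
The new add_request_id middleware echoes a caller-supplied X-Request-Id header, or generates a uuid4 hex when none is sent, whenever request-id headers are enabled. A hedged client sketch; model name and address are assumptions:

```python
# Hedged sketch: pass a trace id and read it back from the response headers.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    headers={"X-Request-Id": "trace-1234"},  # omit to receive a generated id
    json={"model": "my-model", "prompt": "Hello", "max_tokens": 8},
)
print(resp.headers.get("X-Request-Id"))
```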
@@ -488,49 +634,179 @@ def init_app_state(
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats

+    resolved_chat_template = load_chat_template(args.chat_template)
+    logger.info("Using supplied chat template:\n%s", resolved_chat_template)
+
+    state.openai_serving_models = OpenAIServingModels(
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+        lora_modules=args.lora_modules,
+        prompt_adapters=args.prompt_adapters,
+    )
+    # TODO: The chat template is now broken for lora adapters :(
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         args.response_role,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
         request_logger=request_logger,
-        chat_template=args.chat_template,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
         enable_auto_tools=args.enable_auto_tool_choice,
-        tool_parser=args.tool_call_parser)
+        tool_parser=args.tool_call_parser,
+        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+    ) if model_config.runner_type == "generate" else None
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
-        prompt_adapters=args.prompt_adapters,
+        state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
-    )
+    ) if model_config.runner_type == "generate" else None
+    state.openai_serving_pooling = OpenAIServingPooling(
+        engine_client,
+        model_config,
+        state.openai_serving_models,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+    ) if model_config.runner_type == "pooling" else None
     state.openai_serving_embedding = OpenAIServingEmbedding(
         engine_client,
         model_config,
-        base_model_paths,
+        state.openai_serving_models,
         request_logger=request_logger,
-    )
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+    ) if model_config.task == "embed" else None
+    state.openai_serving_scores = OpenAIServingScores(
+        engine_client,
+        model_config,
+        state.openai_serving_models,
+        request_logger=request_logger
+    ) if model_config.task == "score" else None
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=args.lora_modules,
+        state.openai_serving_models,
         request_logger=request_logger,
-        chat_template=args.chat_template,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
     )
+    state.task = model_config.task
+    # if args.served_model_name is not None:
+    #     served_model_names = args.served_model_name
+    # else:
+    #     served_model_names = [args.model]
+
+    # if args.disable_log_requests:
+    #     request_logger = None
+    # else:
+    #     request_logger = RequestLogger(max_log_len=args.max_log_len)
+
+    # base_model_paths = [
+    #     BaseModelPath(name=name, model_path=args.model)
+    #     for name in served_model_names
+    # ]
+
+    # state.engine_client = engine_client
+    # state.log_stats = not args.disable_log_stats
+
+    # resolved_chat_template = load_chat_template(args.chat_template)
+    # logger.info("Using supplied chat template:\n%s", resolved_chat_template)
+
+    # state.openai_serving_chat = OpenAIServingChat(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     args.response_role,
+    #     lora_modules=args.lora_modules,
+    #     prompt_adapters=args.prompt_adapters,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    #     return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+    #     enable_auto_tools=args.enable_auto_tool_choice,
+    #     tool_parser=args.tool_call_parser,
+    #     enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+    # ) if model_config.runner_type == "generate" else None
+    # state.openai_serving_completion = OpenAIServingCompletion(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     lora_modules=args.lora_modules,
+    #     prompt_adapters=args.prompt_adapters,
+    #     request_logger=request_logger,
+    #     return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+    # ) if model_config.runner_type == "generate" else None
+    # state.openai_serving_pooling = OpenAIServingPooling(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    # ) if model_config.runner_type == "pooling" else None
+    # state.openai_serving_embedding = OpenAIServingEmbedding(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    # ) if model_config.task == "embed" else None
+    # state.openai_serving_scores = OpenAIServingScores(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     request_logger=request_logger
+    # ) if model_config.task == "score" else None
+    # state.openai_serving_tokenization = OpenAIServingTokenization(
+    #     engine_client,
+    #     model_config,
+    #     base_model_paths,
+    #     lora_modules=args.lora_modules,
+    #     request_logger=request_logger,
+    #     chat_template=resolved_chat_template,
+    #     chat_template_content_format=args.chat_template_content_format,
+    # )
+
+
+def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
+    family = socket.AF_INET
+    if is_valid_ipv6_address(addr[0]):
+        family = socket.AF_INET6
+
+    sock = socket.socket(family=family, type=socket.SOCK_STREAM)
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    sock.bind(addr)
+
+    return sock


 async def run_server(args, **uvicorn_kwargs) -> None:
     logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)

-    temp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    temp_socket.bind(("", args.port))
+    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
+        ToolParserManager.import_tool_parser(args.tool_parser_plugin)
+
+    valide_tool_parses = ToolParserManager.tool_parsers.keys()
+    if args.enable_auto_tool_choice \
+        and args.tool_call_parser not in valide_tool_parses:
+        raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
+                       f"(chose from {{ {','.join(valide_tool_parses)} }})")
+
+    # workaround to make sure that we bind the port before the engine is set up.
+    # This avoids race conditions with ray.
+    # see https://github.com/vllm-project/vllm/issues/8204
+    sock_addr = (args.host or "", args.port)
+    sock = create_server_socket(sock_addr)
+
+    # workaround to avoid footguns where uvicorn drops requests with too
+    # many concurrent requests active
+    set_ulimit()

     def signal_handler(*_) -> None:
         # Interrupt server on sigterm while initializing
@@ -544,8 +820,6 @@ async def run_server(args, **uvicorn_kwargs) -> None:
         model_config = await engine_client.get_model_config()
         init_app_state(engine_client, model_config, app.state, args)

-        temp_socket.close()
-
         shutdown_task = await serve_http(
             app,
             host=args.host,
@@ -562,13 +836,23 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     # NB: Await server shutdown only after the backend context is exited
     await shutdown_task

+    sock.close()
+

 if __name__ == "__main__":
     # NOTE(simon):
     # This section should be in sync with vllm/scripts.py for CLI entrypoints.
+    logger.warning("Warning: Please use `ipex_llm.vllm.xpu.entrypoints.openai.api_server` "
+                   "instead of `vllm.entrypoints.openai.api_server` to start the API server")
     parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)
+    parser.add_argument(
+        "--load-in-low-bit",
+        type=str,
+        default="sym_int4",
+        help="Low-bit quantization for IPEX-LLM models")
     args = parser.parse_args()
+    validate_parsed_serve_args(args)

     uvloop.run(run_server(args))
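
Per the warning added in the __main__ block, the server should be started from the IPEX-LLM entrypoint rather than vLLM's, and it now accepts --load-in-low-bit. A hedged launch sketch using the module path from the diff; the model id and port are assumptions:

```python
# Hedged sketch: equivalent to
#   python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server ...
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "-m", "ipex_llm.vllm.xpu.entrypoints.openai.api_server",
        "--model", "Qwen/Qwen2-7B-Instruct",  # assumed model id
        "--load-in-low-bit", "sym_int4",      # flag added in this diff
        "--port", "8000",
    ],
    check=True,
)
```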