sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +168 -22
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +49 -0
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +35 -0
  8. sglang/srt/custom_op.py +7 -1
  9. sglang/srt/disaggregation/base/conn.py +2 -0
  10. sglang/srt/disaggregation/decode.py +22 -6
  11. sglang/srt/disaggregation/mooncake/conn.py +289 -48
  12. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  13. sglang/srt/disaggregation/nixl/conn.py +100 -52
  14. sglang/srt/disaggregation/prefill.py +5 -4
  15. sglang/srt/disaggregation/utils.py +13 -12
  16. sglang/srt/distributed/parallel_state.py +44 -17
  17. sglang/srt/entrypoints/EngineBase.py +8 -0
  18. sglang/srt/entrypoints/engine.py +45 -9
  19. sglang/srt/entrypoints/http_server.py +111 -24
  20. sglang/srt/entrypoints/openai/protocol.py +51 -6
  21. sglang/srt/entrypoints/openai/serving_chat.py +52 -76
  22. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  23. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  24. sglang/srt/eplb/__init__.py +0 -0
  25. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  26. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  27. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  28. sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
  29. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  30. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  31. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  32. sglang/srt/hf_transformers_utils.py +2 -1
  33. sglang/srt/layers/activation.py +7 -0
  34. sglang/srt/layers/amx_utils.py +86 -0
  35. sglang/srt/layers/attention/ascend_backend.py +219 -0
  36. sglang/srt/layers/attention/flashattention_backend.py +56 -23
  37. sglang/srt/layers/attention/tbo_backend.py +37 -9
  38. sglang/srt/layers/communicator.py +18 -2
  39. sglang/srt/layers/dp_attention.py +9 -3
  40. sglang/srt/layers/elementwise.py +76 -12
  41. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  42. sglang/srt/layers/layernorm.py +41 -0
  43. sglang/srt/layers/linear.py +99 -12
  44. sglang/srt/layers/logits_processor.py +15 -6
  45. sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
  46. sglang/srt/layers/moe/ep_moe/layer.py +115 -25
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  49. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
  50. sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
  51. sglang/srt/layers/moe/router.py +60 -22
  52. sglang/srt/layers/moe/topk.py +36 -28
  53. sglang/srt/layers/parameter.py +67 -7
  54. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  55. sglang/srt/layers/quantization/fp8.py +44 -0
  56. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  57. sglang/srt/layers/quantization/fp8_utils.py +6 -6
  58. sglang/srt/layers/quantization/gptq.py +5 -1
  59. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  60. sglang/srt/layers/quantization/quant_utils.py +166 -0
  61. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  62. sglang/srt/layers/rotary_embedding.py +105 -13
  63. sglang/srt/layers/vocab_parallel_embedding.py +19 -2
  64. sglang/srt/lora/lora.py +4 -5
  65. sglang/srt/lora/lora_manager.py +73 -20
  66. sglang/srt/managers/configure_logging.py +1 -1
  67. sglang/srt/managers/io_struct.py +60 -15
  68. sglang/srt/managers/mm_utils.py +73 -59
  69. sglang/srt/managers/multimodal_processor.py +2 -6
  70. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  71. sglang/srt/managers/schedule_batch.py +80 -79
  72. sglang/srt/managers/scheduler.py +153 -63
  73. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  74. sglang/srt/managers/session_controller.py +12 -3
  75. sglang/srt/managers/tokenizer_manager.py +314 -103
  76. sglang/srt/managers/tp_worker.py +13 -1
  77. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  78. sglang/srt/mem_cache/allocator.py +290 -0
  79. sglang/srt/mem_cache/chunk_cache.py +34 -2
  80. sglang/srt/mem_cache/memory_pool.py +289 -3
  81. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  82. sglang/srt/model_executor/cuda_graph_runner.py +3 -2
  83. sglang/srt/model_executor/forward_batch_info.py +17 -4
  84. sglang/srt/model_executor/model_runner.py +302 -58
  85. sglang/srt/model_loader/loader.py +86 -10
  86. sglang/srt/model_loader/weight_utils.py +160 -3
  87. sglang/srt/models/deepseek_nextn.py +5 -4
  88. sglang/srt/models/deepseek_v2.py +305 -26
  89. sglang/srt/models/deepseek_vl2.py +3 -5
  90. sglang/srt/models/gemma3_causal.py +1 -2
  91. sglang/srt/models/gemma3n_audio.py +949 -0
  92. sglang/srt/models/gemma3n_causal.py +1010 -0
  93. sglang/srt/models/gemma3n_mm.py +495 -0
  94. sglang/srt/models/hunyuan.py +771 -0
  95. sglang/srt/models/kimi_vl.py +1 -2
  96. sglang/srt/models/llama.py +10 -4
  97. sglang/srt/models/llama4.py +32 -45
  98. sglang/srt/models/llama_eagle3.py +61 -11
  99. sglang/srt/models/llava.py +5 -5
  100. sglang/srt/models/minicpmo.py +2 -2
  101. sglang/srt/models/mistral.py +1 -1
  102. sglang/srt/models/mllama4.py +43 -11
  103. sglang/srt/models/phi4mm.py +1 -3
  104. sglang/srt/models/pixtral.py +3 -7
  105. sglang/srt/models/qwen2.py +31 -3
  106. sglang/srt/models/qwen2_5_vl.py +1 -3
  107. sglang/srt/models/qwen2_audio.py +200 -0
  108. sglang/srt/models/qwen2_moe.py +32 -6
  109. sglang/srt/models/qwen2_vl.py +1 -4
  110. sglang/srt/models/qwen3.py +94 -25
  111. sglang/srt/models/qwen3_moe.py +68 -21
  112. sglang/srt/models/vila.py +3 -8
  113. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
  114. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  115. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  116. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  117. sglang/srt/multimodal/processors/gemma3n.py +82 -0
  118. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  119. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  129. sglang/srt/operations_strategy.py +6 -2
  130. sglang/srt/reasoning_parser.py +26 -0
  131. sglang/srt/sampling/sampling_batch_info.py +39 -1
  132. sglang/srt/server_args.py +85 -24
  133. sglang/srt/speculative/build_eagle_tree.py +57 -18
  134. sglang/srt/speculative/eagle_worker.py +6 -4
  135. sglang/srt/two_batch_overlap.py +204 -28
  136. sglang/srt/utils.py +369 -138
  137. sglang/srt/warmup.py +12 -3
  138. sglang/test/runners.py +10 -1
  139. sglang/test/test_utils.py +15 -3
  140. sglang/version.py +1 -1
  141. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
  142. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
  143. sglang/math_utils.py +0 -8
  144. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  145. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  146. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  147. /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
  148. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
  149. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
@@ -48,10 +48,12 @@ from sglang.srt.managers.io_struct import (
     GetWeightsByNameReqInput,
     ImageDataItem,
     InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
     RpcReqInput,
     RpcReqOutput,
+    UnloadLoRAAdapterReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -115,13 +117,13 @@ class Engine(EngineBase):
         atexit.register(self.shutdown)
 
         # Allocate ports for inter-process communications
-        port_args = PortArgs.init_new(server_args)
+        self.port_args = PortArgs.init_new(server_args)
         logger.info(f"{server_args=}")
 
         # Launch subprocesses
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
-            port_args=port_args,
+            port_args=self.port_args,
         )
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
@@ -130,7 +132,7 @@
 
         context = zmq.Context(2)
         self.send_to_rpc = get_zmq_socket(
-            context, zmq.DEALER, port_args.rpc_ipc_name, True
+            context, zmq.DEALER, self.port_args.rpc_ipc_name, True
         )
 
     def generate(
@@ -242,6 +244,7 @@
         token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
         stream: bool = False,
         bootstrap_host: Optional[Union[List[str], str]] = None,
         bootstrap_port: Optional[Union[List[int], int]] = None,
@@ -274,6 +277,7 @@
             top_logprobs_num=top_logprobs_num,
             token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
+            return_hidden_states=return_hidden_states,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
             bootstrap_host=bootstrap_host,
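
Note: return_hidden_states is now a per-request flag that Engine.generate forwards into GenerateReqInput. A minimal offline sketch, assuming the usual sgl.Engine entry point and a placeholder model path:

# Hedged sketch: ask the offline engine to return hidden states (model path is a placeholder).
import sglang as sgl

llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
out = llm.generate(
    prompt=["The capital of France is"],
    sampling_params={"max_new_tokens": 8},
    return_hidden_states=True,  # new keyword threaded through to GenerateReqInput
)
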
@@ -414,12 +418,21 @@
             self.tokenizer_manager.init_weights_update_group(obj, None)
         )
 
-    def update_weights_from_distributed(self, name: str, dtype, shape):
+    def update_weights_from_distributed(
+        self,
+        names: list[str],
+        dtypes: list[str],
+        shapes: list[list[int]],
+        group_name: str = "weight_update_group",
+        flush_cache: bool = True,
+    ):
         """Update weights from distributed source."""
         obj = UpdateWeightsFromDistributedReqInput(
-            name=name,
-            dtype=dtype,
-            shape=shape,
+            names=names,
+            dtypes=dtypes,
+            shapes=shapes,
+            group_name=group_name,
+            flush_cache=flush_cache,
         )
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(
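
Note: the distributed weight-update API is now batched; callers pass parallel lists of tensor names, dtypes, and shapes, plus an optional group name and a flush_cache toggle. A hedged calling sketch (the engine object and tensor metadata below are placeholders):

# Hedged sketch of the new batched signature; names/shapes are invented examples.
engine.update_weights_from_distributed(
    names=["model.embed_tokens.weight", "lm_head.weight"],
    dtypes=["bfloat16", "bfloat16"],
    shapes=[[128256, 4096], [128256, 4096]],
    group_name="weight_update_group",
    flush_cache=True,  # optionally flush the cache after the update
)
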
@@ -476,6 +489,29 @@
             self.tokenizer_manager.get_weights_by_name(obj, None)
         )
 
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        """Load a new LoRA adapter without re-launching the engine."""
+
+        obj = LoadLoRAAdapterReqInput(
+            lora_name=lora_name,
+            lora_path=lora_path,
+        )
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.load_lora_adapter(obj, None)
+        )
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload a LoRA adapter without re-launching the engine."""
+
+        obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.unload_lora_adapter(obj, None)
+        )
+
     def release_memory_occupation(self, tags: Optional[List[str]] = None):
         obj = ReleaseMemoryOccupationReqInput(tags=tags)
         loop = asyncio.get_event_loop()
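
Note: these Engine methods make LoRA adapters dynamic for offline use. A rough sketch of loading an adapter, routing a request through it, and unloading it again (adapter name/path are placeholders, and an engine launched with LoRA support is assumed):

# Hedged sketch: dynamic LoRA adapter lifecycle on a running Engine.
engine.load_lora_adapter(lora_name="my-adapter", lora_path="/models/loras/my-adapter")
out = engine.generate(
    prompt="Summarize the release notes.",
    sampling_params={"max_new_tokens": 64},
    lora_path=["my-adapter"],  # assumed: requests reference adapters by registered name
)
engine.unload_lora_adapter(lora_name="my-adapter")
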
@@ -606,7 +642,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.6.post1",
+            "0.2.7.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -614,7 +650,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.9",
+            "0.2.4",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
sglang/srt/entrypoints/http_server.py
@@ -72,6 +72,7 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
     ParseFunctionCallReq,
     ProfileReqInput,
@@ -80,6 +81,7 @@
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
+    UnloadLoRAAdapterReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -124,8 +126,6 @@ def set_global_state(global_state: _GlobalState):
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args: ServerArgs = fast_api_app.server_args
-
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -143,9 +143,12 @@ async def lifespan(fast_api_app: FastAPI):
         _global_state.tokenizer_manager
     )
 
+    server_args: ServerArgs = fast_api_app.server_args
     if server_args.warmups is not None:
         await execute_warmups(
-            server_args.warmups.split(","), _global_state.tokenizer_manager
+            server_args.disaggregation_mode,
+            server_args.warmups.split(","),
+            _global_state.tokenizer_manager,
         )
         logger.info("Warmup ended")
 
@@ -278,13 +281,17 @@ async def get_model_info():
         "model_path": _global_state.tokenizer_manager.model_path,
         "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
         "is_generation": _global_state.tokenizer_manager.is_generation,
+        "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
     }
     return result
 
 
 @app.get("/get_server_info")
 async def get_server_info():
-    internal_states = await _global_state.tokenizer_manager.get_internal_state()
+    # Returns interna states per DP.
+    internal_states: List[Dict[Any, Any]] = (
+        await _global_state.tokenizer_manager.get_internal_state()
+    )
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
@@ -298,6 +305,8 @@ async def get_load():
     return await _global_state.tokenizer_manager.get_load()
 
 
+# example usage:
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -351,8 +360,7 @@ async def generate_from_file_request(file: UploadFile, request: Request):
     obj = GenerateReqInput(
         input_embeds=input_embeds,
         sampling_params={
-            "repetition_penalty": 1.2,
-            "temperature": 0.2,
+            "temperature": 0.0,
             "max_new_tokens": 512,
         },
     )
@@ -391,16 +399,6 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
         return _create_error_response(e)
 
 
-@app.api_route(
-    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
-)
-async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
-    """Endpoint for reranking documents based on query relevance."""
-    return await raw_request.app.state.openai_serving_rerank.handle_request(
-        request, raw_request
-    )
-
-
 @app.api_route("/flush_cache", methods=["GET", "POST"])
 async def flush_cache():
     """Flush the radix cache."""
@@ -595,6 +593,40 @@ async def slow_down(obj: SlowDownReqInput, request: Request):
         return _create_error_response(e)
 
 
+@app.api_route("/load_lora_adapter", methods=["POST"])
+async def load_lora_adapter(obj: LoadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.load_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/unload_lora_adapter", methods=["POST"])
+async def unload_lora_adapter(obj: UnloadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.unload_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
 @app.api_route("/open_session", methods=["GET", "POST"])
 async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
@@ -630,7 +662,9 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
 async def abort_request(obj: AbortReq, request: Request):
     """Abort a request."""
     try:
-        _global_state.tokenizer_manager.abort_request(rid=obj.rid)
+        _global_state.tokenizer_manager.abort_request(
+            rid=obj.rid, abort_all=obj.abort_all
+        )
         return Response(status_code=200)
     except Exception as e:
         return _create_error_response(e)
@@ -678,6 +712,26 @@ async def separate_reasoning_request(obj: SeparateReasoningReqInput, request: Re
     return ORJSONResponse(content=response_data, status_code=200)
 
 
+@app.post("/pause_generation")
+async def pause_generation(request: Request):
+    """Pause generation."""
+    await _global_state.tokenizer_manager.pause_generation()
+    return ORJSONResponse(
+        content={"message": "Generation paused successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
+@app.post("/continue_generation")
+async def continue_generation(request: Request):
+    """Continue generation."""
+    await _global_state.tokenizer_manager.continue_generation()
+    return ORJSONResponse(
+        content={"message": "Generation continued successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
 ##### OpenAI-compatible API endpoints #####
 
 
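Note: pause_generation / continue_generation take no request body and simply toggle the tokenizer manager's generation state, which is handy around weight syncs. A quick sketch (server URL is a placeholder):

# Hedged sketch: pause generation, do maintenance, then resume.
import requests

base = "http://localhost:30000"
print(requests.post(f"{base}/pause_generation").json())     # {"message": ..., "status": "ok"}
# ... e.g. run a weight update here ...
print(requests.post(f"{base}/continue_generation").json())
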
@@ -805,6 +859,16 @@ async def v1_score_request(request: ScoringRequest, raw_request: Request):
     )
 
 
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -851,6 +915,15 @@ def launch_server(
         add_prometheus_middleware(app)
         enable_func_timer()
 
+    image_token_text = None
+    if (
+        tokenizer_manager.image_token_id is not None
+        and not server_args.skip_tokenizer_init
+    ):
+        image_token_text = tokenizer_manager.tokenizer.decode(
+            [tokenizer_manager.image_token_id]
+        )
+
     # Send a warmup request - we will create the thread launch it
     # in the lifespan after all other warmups have fired.
     warmup_thread = threading.Thread(
@@ -858,7 +931,7 @@
         args=(
             server_args,
             pipe_finish_writer,
-            _global_state.tokenizer_manager.image_token_id,
+            image_token_text,
             launch_callback,
         ),
     )
@@ -881,11 +954,9 @@
     warmup_thread.join()
 
 
-def _wait_and_warmup(
+def _execute_server_warmup(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection],
-    image_token_text: str,
-    launch_callback: Optional[Callable[[], None]] = None,
 ):
     headers = {}
     url = server_args.url()
@@ -910,7 +981,7 @@
             pipe_finish_writer.send(last_traceback)
         logger.error(f"Initialization failed. warmup error: {last_traceback}")
         kill_process_tree(os.getpid())
-        return
+        return success
 
     model_info = res.json()
 
@@ -984,12 +1055,28 @@
             pipe_finish_writer.send(last_traceback)
         logger.error(f"Initialization failed. warmup error: {last_traceback}")
         kill_process_tree(os.getpid())
-        return
+        return False
 
     # Debug print
-    # logger.info(f"{res.json()=}")
+    # logger.info(f"warmup request returns: {res.json()=}")
+    return success
+
+
+def _wait_and_warmup(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection],
+    image_token_text: str,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    if not server_args.skip_server_warmup:
+        if not _execute_server_warmup(
+            server_args,
+            pipe_finish_writer,
+        ):
+            return
 
     logger.info("The server is fired up and ready to roll!")
+
     if pipe_finish_writer is not None:
         pipe_finish_writer.send("ready")
 
sglang/srt/entrypoints/openai/protocol.py
@@ -14,7 +14,8 @@
 """Pydantic models for OpenAI API protocol"""
 
 import time
-from typing import Dict, List, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import (
     BaseModel,
@@ -195,6 +196,9 @@ class CompletionRequest(BaseModel):
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+
     @field_validator("max_tokens")
     @classmethod
     def validate_max_tokens_positive(cls, v):
@@ -232,7 +236,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
@@ -309,6 +313,18 @@ class ChatCompletionMessageGenericParam(BaseModel):
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
+    @field_validator("role", mode="before")
+    @classmethod
+    def _normalize_role(cls, v):
+        if isinstance(v, str):
+            v_lower = v.lower()
+            if v_lower not in {"system", "assistant", "tool"}:
+                raise ValueError(
+                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
+                )
+            return v_lower
+        raise ValueError("'role' must be a string")
+
 
 class ChatCompletionMessageUserParam(BaseModel):
     role: Literal["user"]
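
Note: the new validator lower-cases the role and rejects anything outside system/assistant/tool. A standalone re-implementation (not the sglang class itself) that shows the behavior:

# Standalone pydantic sketch mirroring the case-insensitive role normalization.
from pydantic import BaseModel, field_validator

class GenericMessageSketch(BaseModel):
    role: str

    @field_validator("role", mode="before")
    @classmethod
    def _normalize_role(cls, v):
        if isinstance(v, str) and (lowered := v.lower()) in {"system", "assistant", "tool"}:
            return lowered
        raise ValueError("'role' must be one of 'system', 'assistant', or 'tool'")

print(GenericMessageSketch(role="Assistant").role)  # -> "assistant"
print(GenericMessageSketch(role="TOOL").role)       # -> "tool"
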
@@ -429,8 +445,8 @@ class ChatCompletionRequest(BaseModel):
     stream_reasoning: bool = True
     chat_template_kwargs: Optional[Dict] = None
 
-    # The request id.
-    rid: Optional[str] = None
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
@@ -494,7 +510,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: Optional[
-        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
     ] = None
     matched_stop: Union[None, int, str] = None
 
@@ -528,7 +546,7 @@ class EmbeddingRequest(BaseModel):
     user: Optional[str] = None
 
     # The request id.
-    rid: Optional[str] = None
+    rid: Optional[Union[List[str], str]] = None
 
 
 class EmbeddingObject(BaseModel):
@@ -587,3 +605,30 @@ OpenAIServingRequest = Union[
     ScoringRequest,
     V1RerankReqInput,
 ]
+
+
+@dataclass
+class MessageProcessingResult:
+    """Result of processing chat messages and applying templates.
+
+    This dataclass encapsulates all the outputs from message processing including
+    prompt generation, multimodal data extraction, and constraint preparation.
+    Used internally by OpenAIServingChat to pass processed data between methods.
+
+    Args:
+        prompt: The final text prompt after applying chat template
+        prompt_ids: Either the text prompt (str) or tokenized IDs (List[int])
+        image_data: Extracted image data from messages, if any
+        audio_data: Extracted audio data from messages, if any
+        modalities: List of modality types present in the messages
+        stop: Combined stop strings from template and request
+        tool_call_constraint: Optional constraint for structured tool calls
+    """
+
+    prompt: str
+    prompt_ids: Union[str, List[int]]
+    image_data: Optional[Any]
+    audio_data: Optional[Any]
+    modalities: List[str]
+    stop: List[str]
+    tool_call_constraint: Optional[Any] = None
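
Note: MessageProcessingResult is an internal carrier for the chat-template outputs that OpenAIServingChat shuttles between its helper methods. A construction sketch with invented values:

# Hedged sketch: a populated MessageProcessingResult (all values are made up).
result = MessageProcessingResult(
    prompt="<|user|>Describe this image.<|assistant|>",
    prompt_ids=[101, 2023, 2003, 1037, 7279],
    image_data=["data:image/png;base64,..."],
    audio_data=None,
    modalities=["image"],
    stop=["<|end|>"],
    tool_call_constraint=None,
)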