sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/http_server.py

@@ -26,7 +26,7 @@ import os
  import threading
  import time
  from http import HTTPStatus
- from typing import AsyncIterator, Callable, Dict, Optional
+ from typing import Any, AsyncIterator, Callable, Dict, List, Optional

  # Fix a bug of Python threading
  setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -88,6 +88,7 @@ from sglang.srt.managers.io_struct import (
  UpdateWeightFromDiskReqInput,
  UpdateWeightsFromDistributedReqInput,
  UpdateWeightsFromTensorReqInput,
+ UpdateWeightVersionReqInput,
  VertexGenerateReqInput,
  )
  from sglang.srt.managers.template_manager import TemplateManager
@@ -174,7 +175,6 @@ async def lifespan(fast_api_app: FastAPI):
  tool_server=tool_server,
  )
  except Exception as e:
- # print stack trace
  import traceback

  traceback.print_exc()
@@ -277,7 +277,7 @@ async def health_generate(request: Request) -> Response:
  logger.info("Health check request received during shutdown. Returning 503.")
  return Response(status_code=503)

- if not _global_state.tokenizer_manager.server_status.is_healthy():
+ if _global_state.tokenizer_manager.server_status == ServerStatus.Starting:
  return Response(status_code=503)

  sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
@@ -317,7 +317,7 @@ async def health_generate(request: Request) -> Response:
  if _global_state.tokenizer_manager.last_receive_tstamp > tic:
  task.cancel()
  _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
- _global_state.tokenizer_manager.health_check_failed = False
+ _global_state.tokenizer_manager.server_status = ServerStatus.Up
  return Response(status_code=200)

  task.cancel()
@@ -331,7 +331,7 @@ async def health_generate(request: Request) -> Response:
  f"last_heartbeat time: {last_receive_time}"
  )
  _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
- _global_state.tokenizer_manager.health_check_failed = True
+ _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
  return Response(status_code=503)


@@ -343,10 +343,19 @@ async def get_model_info():
  "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
  "is_generation": _global_state.tokenizer_manager.is_generation,
  "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
+ "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
  }
  return result


+ @app.get("/get_weight_version")
+ async def get_weight_version():
+ """Get the current weight version."""
+ return {
+ "weight_version": _global_state.tokenizer_manager.server_args.weight_version
+ }
+
+
  @app.get("/get_server_info")
  async def get_server_info():
  # Returns interna states per DP.
@@ -538,6 +547,12 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
  success, message, num_paused_requests = (
  await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
  )
+
+ # Update weight version if provided and weights update was successful
+ if success and obj.weight_version is not None:
+ _update_weight_version_if_provided(obj.weight_version)
+ message += f" Weight version updated to {obj.weight_version}."
+
  content = {
  "success": success,
  "message": message,
@@ -584,6 +599,12 @@ async def update_weights_from_tensor(
  success, message = await _global_state.tokenizer_manager.update_weights_from_tensor(
  obj, request
  )
+
+ # Update weight version if provided and weights update was successful
+ if success and obj.weight_version is not None:
+ _update_weight_version_if_provided(obj.weight_version)
+ message += f" Weight version updated to {obj.weight_version}."
+
  content = {"success": success, "message": message}
  return ORJSONResponse(
  content, status_code=200 if success else HTTPStatus.BAD_REQUEST
@@ -600,6 +621,12 @@ async def update_weights_from_distributed(
  obj, request
  )
  )
+
+ # Update weight version if provided and weights update was successful
+ if success and obj.weight_version is not None:
+ _update_weight_version_if_provided(obj.weight_version)
+ message += f" Weight version updated to {obj.weight_version}."
+
  content = {"success": success, "message": message}
  if success:
  return ORJSONResponse(content, status_code=200)
@@ -607,6 +634,36 @@ async def update_weights_from_distributed(
  return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)


+ @app.post("/update_weight_version")
+ async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
+ """Update the weight version. This operation requires no active requests."""
+ if obj.abort_all_requests:
+ _global_state.tokenizer_manager.abort_request(abort_all=True)
+
+ # Use a simple approach without the complex lock mechanism for now
+ # since weight_version update is a simple operation that doesn't affect model weights
+ try:
+ # Update the weight version in server args (the single source of truth)
+ _global_state.tokenizer_manager.server_args.weight_version = obj.new_version
+
+ return ORJSONResponse(
+ {
+ "success": True,
+ "message": f"Weight version updated to {obj.new_version}",
+ "new_version": obj.new_version,
+ },
+ status_code=HTTPStatus.OK,
+ )
+ except Exception as e:
+ return ORJSONResponse(
+ {
+ "success": False,
+ "message": f"Failed to update weight version: {str(e)}",
+ },
+ status_code=HTTPStatus.BAD_REQUEST,
+ )
+
+
  @app.api_route("/get_weights_by_name", methods=["GET", "POST"])
  async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
  """Get model parameter by name."""
@@ -967,6 +1024,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
  return ORJSONResponse({"predictions": ret})


+ def _update_weight_version_if_provided(weight_version: Optional[str]) -> None:
+ """Update weight version if provided."""
+ if weight_version is not None:
+ _global_state.tokenizer_manager.server_args.weight_version = weight_version
+
+
  def _create_error_response(e):
  return ORJSONResponse(
  {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST

sglang/srt/entrypoints/openai/protocol.py

@@ -240,6 +240,7 @@ class CompletionResponse(BaseModel):
  model: str
  choices: List[CompletionResponseChoice]
  usage: UsageInfo
+ metadata: Optional[Dict[str, Any]] = None


  class CompletionResponseStreamChoice(BaseModel):
@@ -517,6 +518,7 @@ class ChatCompletionResponse(BaseModel):
  model: str
  choices: List[ChatCompletionResponseChoice]
  usage: UsageInfo
+ metadata: Optional[Dict[str, Any]] = None


  class DeltaMessage(BaseModel):
@@ -859,15 +861,6 @@ class ResponseReasoningTextContent(BaseModel):
  type: Literal["reasoning_text"] = "reasoning_text"


- class ResponseReasoningItem(BaseModel):
- id: str
- content: list[ResponseReasoningTextContent] = Field(default_factory=list)
- summary: list = Field(default_factory=list)
- type: Literal["reasoning"] = "reasoning"
- encrypted_content: Optional[str] = None
- status: Optional[Literal["in_progress", "completed", "incomplete"]]
-
-
  ResponseInputOutputItem: TypeAlias = Union[
  ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
  ]
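
The new optional metadata field on CompletionResponse and ChatCompletionResponse is populated with the current weight_version by the serving_chat.py and serving_completions.py hunks further down. A short sketch of reading it from a non-streaming chat completion; the URL, model name, and use of requests are assumptions:

import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",  # assumed server address
    json={
        "model": "default",  # assumed served model name
        "messages": [{"role": "user", "content": "hello"}],
    },
).json()

# metadata is optional, so guard against servers that omit it or send null.
print((resp.get("metadata") or {}).get("weight_version"))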

sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,18 +7,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union

  from fastapi import Request
  from fastapi.responses import ORJSONResponse, StreamingResponse
- from openai_harmony import Message as OpenAIMessage

  from sglang.srt.conversation import generate_chat_conv
- from sglang.srt.entrypoints.harmony_utils import (
- get_developer_message,
- get_stop_tokens_for_assistant_actions,
- get_streamable_parser_for_assistant,
- get_system_message,
- parse_chat_input,
- parse_output_into_messages,
- render_for_completion,
- )
  from sglang.srt.entrypoints.openai.protocol import (
  ChatCompletionRequest,
  ChatCompletionResponse,
@@ -57,30 +47,12 @@ class OpenAIServingChat(OpenAIServingBase):
  """Handler for /v1/chat/completions requests"""

  def __init__(
- self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+ self,
+ tokenizer_manager: TokenizerManager,
+ template_manager: TemplateManager,
  ):
  super().__init__(tokenizer_manager)
  self.template_manager = template_manager
- self.use_harmony = (
- self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
- )
-
- if self.use_harmony:
- from sglang.srt.function_call.harmony_tool_parser import (
- HarmonyToolCallParser,
- )
-
- self.harmony_tool_parser = HarmonyToolCallParser()
-
- # NOTE While OpenAI's chat completion API supports browsing
- # for some models, currently vLLM doesn't support it. Please use the
- # Responses API instead.
- self.supports_browsing = False
- self.browser_tool = None
- # NOTE: Chat completion API does not support code interpreter.
- # Please use the Responses API instead.
- self.supports_code_interpreter = False
- self.python_tool = None

  def _request_id_prefix(self) -> str:
  return "chatcmpl-"
@@ -97,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
  ):
  return "Tools cannot be empty if tool choice is set to required."

+ max_output_tokens = request.max_completion_tokens or request.max_tokens
+ server_context_length = self.tokenizer_manager.server_args.context_length
+ if (
+ max_output_tokens
+ and server_context_length
+ and max_output_tokens > server_context_length
+ ):
+ return (
+ f"max_completion_tokens is too large: {max_output_tokens}."
+ f"This model supports at most {server_context_length} completion tokens."
+ )
+
  return None

  def _convert_to_internal_request(
@@ -107,66 +91,43 @@ class OpenAIServingChat(OpenAIServingBase):
  is_multimodal = self.tokenizer_manager.model_config.is_multimodal

  # Process messages and apply chat template
- if not self.use_harmony:
- processed_messages = self._process_messages(request, is_multimodal)
-
- # Build sampling parameters
- sampling_params = self._build_sampling_params(
- request,
- processed_messages.stop,
- processed_messages.tool_call_constraint,
- )
+ processed_messages = self._process_messages(request, is_multimodal)

- # Handle single vs multiple requests
- if is_multimodal:
- prompt_kwargs = {"text": processed_messages.prompt}
- else:
- if isinstance(processed_messages.prompt_ids, str):
- prompt_kwargs = {"text": processed_messages.prompt_ids}
- else:
- prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
-
- adapted_request = GenerateReqInput(
- **prompt_kwargs,
- image_data=processed_messages.image_data,
- video_data=processed_messages.video_data,
- audio_data=processed_messages.audio_data,
- sampling_params=sampling_params,
- return_logprob=request.logprobs,
- logprob_start_len=-1,
- top_logprobs_num=request.top_logprobs or 0,
- stream=request.stream,
- return_text_in_logprobs=True,
- modalities=processed_messages.modalities,
- lora_path=request.lora_path,
- bootstrap_host=request.bootstrap_host,
- bootstrap_port=request.bootstrap_port,
- bootstrap_room=request.bootstrap_room,
- return_hidden_states=request.return_hidden_states,
- rid=request.rid,
- )
+ # Build sampling parameters
+ sampling_params = self._build_sampling_params(
+ request,
+ processed_messages.stop,
+ processed_messages.tool_call_constraint,
+ )
+
+ # Handle single vs multiple requests
+ if is_multimodal:
+ prompt_kwargs = {"text": processed_messages.prompt}
  else:
- processed_messages, prompt_ids = self._make_request_with_harmony(request)
-
- adapted_request = GenerateReqInput(
- input_ids=prompt_ids,
- sampling_params=self._build_sampling_params(
- request,
- request.stop,
- tool_call_constraint=None,
- ),
- stream=request.stream,
- return_logprob=request.logprobs,
- logprob_start_len=-1,
- top_logprobs_num=request.top_logprobs or 0,
- return_text_in_logprobs=True,
- lora_path=request.lora_path,
- bootstrap_host=request.bootstrap_host,
- bootstrap_port=request.bootstrap_port,
- bootstrap_room=request.bootstrap_room,
- return_hidden_states=request.return_hidden_states,
- rid=request.rid,
- )
+ if isinstance(processed_messages.prompt_ids, str):
+ prompt_kwargs = {"text": processed_messages.prompt_ids}
+ else:
+ prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+ adapted_request = GenerateReqInput(
+ **prompt_kwargs,
+ image_data=processed_messages.image_data,
+ video_data=processed_messages.video_data,
+ audio_data=processed_messages.audio_data,
+ sampling_params=sampling_params,
+ return_logprob=request.logprobs,
+ logprob_start_len=-1,
+ top_logprobs_num=request.top_logprobs or 0,
+ stream=request.stream,
+ return_text_in_logprobs=True,
+ modalities=processed_messages.modalities,
+ lora_path=request.lora_path,
+ bootstrap_host=request.bootstrap_host,
+ bootstrap_port=request.bootstrap_port,
+ bootstrap_room=request.bootstrap_room,
+ return_hidden_states=request.return_hidden_states,
+ rid=request.rid,
+ )

  return adapted_request, request

@@ -251,14 +212,15 @@ class OpenAIServingChat(OpenAIServingBase):
  tokenize=True,
  add_generation_prompt=True,
  tools=tools,
+ reasoning_effort=request.reasoning_effort,
  **(
  request.chat_template_kwargs if request.chat_template_kwargs else {}
  ),
  )
  except Exception:
- # This except branch will be triggered when the chosen model
- # has a different tools input format that is not compatible
- # with openAI's apply_chat_template tool_call format, like Mistral.
+ # This except branch will be triggered when the chosen model
+ # has a different tools input format that is not compatible
+ # with openAI's apply_chat_template tool_call format, like Mistral.
  tools = (
  [t if "function" in t else {"function": t} for t in tools]
  if tools
@@ -269,6 +231,7 @@ class OpenAIServingChat(OpenAIServingBase):
  tokenize=True,
  add_generation_prompt=True,
  tools=tools,
+ reasoning_effort=request.reasoning_effort,
  **(
  request.chat_template_kwargs if request.chat_template_kwargs else {}
  ),
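
Both apply_chat_template call sites now forward reasoning_effort from the chat completion request, so chat templates that accept it (such as the gpt-oss template) can adjust the rendered prompt. A hedged request sketch showing the field alongside a normal chat payload; the URL and model name are assumptions:

import requests

requests.post(
    "http://localhost:30000/v1/chat/completions",  # assumed server address
    json={
        "model": "default",  # assumed served model name
        "messages": [{"role": "user", "content": "Summarize the attention mechanism."}],
        "reasoning_effort": "low",  # forwarded to apply_chat_template by the hunks above
    },
)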
@@ -459,12 +422,6 @@ class OpenAIServingChat(OpenAIServingBase):
  cached_tokens = {}
  hidden_states = {}

- # Harmony tracking
- if self.use_harmony:
- harmony_parsers = [
- get_streamable_parser_for_assistant() for _ in range(request.n)
- ]
-
  try:
  async for content in self.tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -511,58 +468,14 @@ class OpenAIServingChat(OpenAIServingBase):
  )
  yield f"data: {chunk.model_dump_json()}\n\n"

- # Process content delta
- if self.use_harmony:
- harmony_parser = harmony_parsers[index]
-
- new_token_ids = content["output_ids"]
- for token_id in new_token_ids:
- harmony_parser.process(token_id)
-
- is_final = harmony_parser.current_channel == "final"
- is_analysis = harmony_parser.current_channel == "analysis"
- delta = harmony_parser.last_content_delta or ""
-
- if is_analysis:
- choice_data = ChatCompletionResponseStreamChoice(
- index=index,
- delta=DeltaMessage(reasoning_content=delta),
- finish_reason=None,
- )
- chunk = ChatCompletionStreamResponse(
- id=content["meta_info"]["id"],
- created=int(time.time()),
- choices=[choice_data],
- model=request.model,
- )
- yield f"data: {chunk.model_dump_json()}\n\n"
- continue
-
- choice_data = ChatCompletionResponseStreamChoice(
- index=index,
- delta=DeltaMessage(content=delta if delta else None),
- finish_reason=None,
- matched_stop=None,
- logprobs=choice_logprobs,
- )
- chunk = ChatCompletionStreamResponse(
- id=content["meta_info"]["id"],
- created=int(time.time()),
- choices=[choice_data],
- model=request.model,
- )
- yield f"data: {chunk.model_dump_json()}\n\n"
- continue
- else:
- stream_buffer = stream_buffers.get(index, "")
- delta = content["text"][len(stream_buffer) :]
- stream_buffers[index] = stream_buffer + delta
+ stream_buffer = stream_buffers.get(index, "")
+ delta = content["text"][len(stream_buffer) :]
+ stream_buffers[index] = stream_buffer + delta

  # Handle reasoning content
  if (
  self.tokenizer_manager.server_args.reasoning_parser
  and request.separate_reasoning
- and not self.use_harmony
  ):
  reasoning_text, delta = self._process_reasoning_stream(
  index, delta, reasoning_parser_dict, content, request
@@ -581,27 +494,8 @@ class OpenAIServingChat(OpenAIServingBase):
  )
  yield f"data: {chunk.model_dump_json()}\n\n"

- if self.use_harmony and not is_final:
- choice_data = ChatCompletionResponseStreamChoice(
- index=index,
- delta=DeltaMessage(reasoning_content=delta),
- finish_reason=None,
- )
- chunk = ChatCompletionStreamResponse(
- id=content["meta_info"]["id"],
- created=int(time.time()),
- choices=[choice_data],
- model=request.model,
- )
- yield f"data: {chunk.model_dump_json()}\n\n"
-
  # Handle tool calls
- # TODO: support tool call parsing for harmony
- if (
- request.tool_choice != "none"
- and request.tools
- and not self.use_harmony
- ):
+ if request.tool_choice != "none" and request.tools:
  async for chunk in self._process_tool_call_stream(
  index,
  delta,
@@ -765,76 +659,6 @@ class OpenAIServingChat(OpenAIServingBase):

  finish_reason = ret_item["meta_info"]["finish_reason"]
  text = ret_item["text"]
- output_ids = ret_item["output_ids"]
-
- if self.use_harmony:
- parser = parse_output_into_messages(output_ids)
- output_msgs = parser.messages
- if len(output_msgs) == 0:
- # The generation has stopped during reasoning.
- is_tool_call = False
- reasoning_content = parser.current_content
- final_content = None
- elif len(output_msgs) == 1:
- # The generation has stopped during final message.
- is_tool_call = False
- reasoning_content = output_msgs[0].content[0].text
- final_content = parser.current_content
- else:
- if len(output_msgs) != 2:
- raise ValueError(
- "Expected 2 output messages (reasoning and final), "
- f"but got {len(output_msgs)}."
- )
- reasoning_msg, final_msg = output_msgs
- reasoning_content = reasoning_msg.content[0].text
- final_content = final_msg.content[0].text
- is_tool_call = final_msg.recipient is not None
-
- if is_tool_call:
- # Extract tool call information from final message
- tool_call = (
- self.harmony_tool_parser.extract_tool_calls_from_message(
- final_msg
- )
- )
- tool_calls = [tool_call] if tool_call else []
-
- message = ChatMessage(
- role="assistant",
- reasoning_content=reasoning_content,
- content=None, # Tool calls don't have regular content
- tool_calls=tool_calls,
- )
- else:
- # Normal message
- message = ChatMessage(
- role="assistant",
- reasoning_content=reasoning_content,
- content=final_content,
- )
-
- if is_tool_call:
- finish_reason_type = "tool_calls"
- elif finish_reason:
- finish_reason_type = (
- finish_reason["type"] if finish_reason else "stop"
- )
- else:
- finish_reason_type = "stop"
- choice_data = ChatCompletionResponseChoice(
- index=idx,
- message=message,
- logprobs=choice_logprobs,
- finish_reason=finish_reason_type,
- matched_stop=(
- finish_reason["matched"]
- if finish_reason and "matched" in finish_reason
- else None
- ),
- )
- choices.append(choice_data)
- continue

  # Handle reasoning content
  reasoning_text = None
@@ -899,6 +723,7 @@ class OpenAIServingChat(OpenAIServingBase):
  model=request.model,
  choices=choices,
  usage=usage,
+ metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
  )

  def _process_logprobs_tokens(
@@ -1184,33 +1009,3 @@ class OpenAIServingChat(OpenAIServingBase):
  return f"data: {chunk.model_dump_json()}\n\n"

  return None
-
- def _make_request_with_harmony(
- self,
- request: ChatCompletionRequest,
- ):
- messages: list[OpenAIMessage] = []
-
- # Add system message.
- # In Chat Completion API, browsing is enabled by default if the model
- # supports it.
- assert not self.supports_browsing
- assert not self.supports_code_interpreter
- sys_msg = get_system_message(
- reasoning_effort=request.reasoning_effort,
- browser_description=None,
- python_description=None,
- )
- messages.append(sys_msg)
-
- # Add developer message.
- dev_msg = get_developer_message()
- messages.append(dev_msg)
-
- # Add user message.
- for chat_msg in request.messages:
- messages.append(parse_chat_input(chat_msg))
-
- # Render prompt token ids.
- prompt_token_ids = render_for_completion(messages)
- return messages, prompt_token_ids

sglang/srt/entrypoints/openai/serving_completions.py

@@ -373,6 +373,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
  created=created,
  choices=choices,
  usage=usage,
+ metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
  )

  def _get_echo_text(self, request: CompletionRequest, index: int) -> str:

sglang/srt/entrypoints/openai/tool_server.py

@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
  from contextlib import AbstractAsyncContextManager, asynccontextmanager
  from typing import Any

- logger = logging.getLogger(__name__)
  try:
  from mcp import ClientSession
  from mcp.client.sse import sse_client
  from mcp.types import ListToolsResult
- except ImportError:
- logger.warning("Ignoring mcp import error")
+ except ImportError as e:
+ ClientSession = sse_client = ListToolsResult = e

  from openai_harmony import ToolDescription, ToolNamespaceConfig

+ logger = logging.getLogger(__name__)
+

  async def list_server_and_tools(server_url: str):


sglang/srt/function_call/ebnf_composer.py

@@ -316,6 +316,7 @@ class EBNFComposer:

  combined_args = "".join(rule_parts)
  arguments_rule = args_template.format(arg_rules=combined_args)
+ arguments_rule = arguments_rule or '""'

  # Add the function call rule and its arguments rule
  ebnf_lines.append(

sglang/srt/function_call/function_call_parser.py

@@ -11,6 +11,7 @@ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
  from sglang.srt.function_call.core_types import ToolCallItem
  from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
  from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+ from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
  from sglang.srt.function_call.kimik2_detector import KimiK2Detector
  from sglang.srt.function_call.llama32_detector import Llama32Detector
  from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -41,6 +42,7 @@ class FunctionCallParser:
  "qwen3_coder": Qwen3CoderDetector,
  "glm45": Glm4MoeDetector,
  "step3": Step3Detector,
+ "gpt-oss": GptOssDetector,
  }

  def __init__(self, tools: List[Tool], tool_call_parser: str):
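
These two hunks register the new GptOssDetector (added in sglang/srt/function_call/gpt_oss_detector.py per the file list) under the parser name "gpt-oss". A minimal sketch of selecting it through the constructor shown above; the empty tool list is only a placeholder, since in practice the tools come from the request's tool schema:

from sglang.srt.function_call.function_call_parser import FunctionCallParser

# "gpt-oss" now resolves to GptOssDetector via the registry entry added above.
parser = FunctionCallParser(tools=[], tool_call_parser="gpt-oss")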

sglang/srt/function_call/glm4_moe_detector.py

@@ -158,7 +158,7 @@ class Glm4MoeDetector(BaseFormatDetector):
  individual_call_end_token=self.eot_token,
  tool_call_separator="\\n",
  function_format="xml",
- call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+ call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
  key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
  key_value_separator="\\n",
  )