sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -1
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +8 -7
- sglang/srt/disaggregation/decode.py +8 -4
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +68 -5
- sglang/srt/entrypoints/openai/protocol.py +2 -9
- sglang/srt/entrypoints/openai/serving_chat.py +60 -265
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +55 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +24 -27
- sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +11 -13
- sglang/srt/layers/dp_attention.py +118 -27
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +12 -18
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +10 -35
- sglang/srt/layers/quantization/awq.py +15 -16
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/gptq.py +12 -17
- sglang/srt/layers/quantization/marlin_utils.py +15 -5
- sglang/srt/layers/quantization/modelopt_quant.py +58 -41
- sglang/srt/layers/quantization/mxfp4.py +20 -3
- sglang/srt/layers/quantization/utils.py +52 -2
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +66 -116
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +24 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +43 -49
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +18 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +53 -44
- sglang/srt/mem_cache/allocator.py +39 -214
- sglang/srt/mem_cache/allocator_ascend.py +158 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -23
- sglang/srt/model_executor/forward_batch_info.py +33 -14
- sglang/srt/model_executor/model_runner.py +179 -81
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_nextn.py +2 -1
- sglang/srt/models/deepseek_v2.py +79 -38
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +11 -11
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +142 -20
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +10 -27
- sglang/srt/models/llama4.py +19 -6
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +20 -5
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_classification.py +78 -0
- sglang/srt/models/qwen3_moe.py +18 -5
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +6 -2
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/operations.py +17 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +142 -140
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +16 -12
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/http_server.py

@@ -26,7 +26,7 @@ import os
 import threading
 import time
 from http import HTTPStatus
-from typing import AsyncIterator, Callable, Dict, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -88,6 +88,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
+    UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.template_manager import TemplateManager
@@ -174,7 +175,6 @@ async def lifespan(fast_api_app: FastAPI):
             tool_server=tool_server,
         )
     except Exception as e:
-        # print stack trace
         import traceback

         traceback.print_exc()
@@ -277,7 +277,7 @@ async def health_generate(request: Request) -> Response:
         logger.info("Health check request received during shutdown. Returning 503.")
         return Response(status_code=503)

-    if
+    if _global_state.tokenizer_manager.server_status == ServerStatus.Starting:
         return Response(status_code=503)

     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
@@ -317,7 +317,7 @@ async def health_generate(request: Request) -> Response:
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
             _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
-            _global_state.tokenizer_manager.
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
             return Response(status_code=200)

     task.cancel()
@@ -331,7 +331,7 @@ async def health_generate(request: Request) -> Response:
         f"last_heartbeat time: {last_receive_time}"
     )
     _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
-    _global_state.tokenizer_manager.
+    _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
     return Response(status_code=503)


@@ -343,10 +343,19 @@ async def get_model_info():
         "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
         "is_generation": _global_state.tokenizer_manager.is_generation,
         "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
     }
     return result


+@app.get("/get_weight_version")
+async def get_weight_version():
+    """Get the current weight version."""
+    return {
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version
+    }
+
+
 @app.get("/get_server_info")
 async def get_server_info():
     # Returns interna states per DP.
@@ -538,6 +547,12 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     success, message, num_paused_requests = (
         await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {
         "success": success,
         "message": message,
@@ -584,6 +599,12 @@ async def update_weights_from_tensor(
     success, message = await _global_state.tokenizer_manager.update_weights_from_tensor(
         obj, request
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {"success": success, "message": message}
     return ORJSONResponse(
         content, status_code=200 if success else HTTPStatus.BAD_REQUEST
@@ -600,6 +621,12 @@ async def update_weights_from_distributed(
             obj, request
         )
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {"success": success, "message": message}
     if success:
         return ORJSONResponse(content, status_code=200)
@@ -607,6 +634,36 @@ async def update_weights_from_distributed(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)


+@app.post("/update_weight_version")
+async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
+    """Update the weight version. This operation requires no active requests."""
+    if obj.abort_all_requests:
+        _global_state.tokenizer_manager.abort_request(abort_all=True)
+
+    # Use a simple approach without the complex lock mechanism for now
+    # since weight_version update is a simple operation that doesn't affect model weights
+    try:
+        # Update the weight version in server args (the single source of truth)
+        _global_state.tokenizer_manager.server_args.weight_version = obj.new_version
+
+        return ORJSONResponse(
+            {
+                "success": True,
+                "message": f"Weight version updated to {obj.new_version}",
+                "new_version": obj.new_version,
+            },
+            status_code=HTTPStatus.OK,
+        )
+    except Exception as e:
+        return ORJSONResponse(
+            {
+                "success": False,
+                "message": f"Failed to update weight version: {str(e)}",
+            },
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
 @app.api_route("/get_weights_by_name", methods=["GET", "POST"])
 async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
     """Get model parameter by name."""
@@ -967,6 +1024,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
     return ORJSONResponse({"predictions": ret})


+def _update_weight_version_if_provided(weight_version: Optional[str]) -> None:
+    """Update weight version if provided."""
+    if weight_version is not None:
+        _global_state.tokenizer_manager.server_args.weight_version = weight_version
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
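
The new weight-version plumbing above can be exercised over plain HTTP. A minimal sketch, assuming a local server on sglang's default port 30000; the endpoint paths and the `new_version`, `abort_all_requests`, and `weight_version` fields come from the hunks above, while the URL and version string are illustrative:

```python
import requests

BASE_URL = "http://localhost:30000"  # assumed local sglang server

# Read the current weight version via the new GET endpoint.
print(requests.get(f"{BASE_URL}/get_weight_version").json())

# Tag the weights with a new version string; abort_all_requests mirrors
# the field on UpdateWeightVersionReqInput shown in the diff.
resp = requests.post(
    f"{BASE_URL}/update_weight_version",
    json={"new_version": "2025-08-01-a", "abort_all_requests": False},
)
print(resp.json())  # expected: {"success": true, ..., "new_version": "2025-08-01-a"}

# /get_model_info now reports the version as well.
print(requests.get(f"{BASE_URL}/get_model_info").json().get("weight_version"))
```

The same version tag can also be set as a side effect of the weight-update endpoints: `update_weights_from_disk`, `update_weights_from_tensor`, and `update_weights_from_distributed` all route an optional `weight_version` field through `_update_weight_version_if_provided`.
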

sglang/srt/entrypoints/openai/protocol.py

@@ -240,6 +240,7 @@ class CompletionResponse(BaseModel):
     model: str
     choices: List[CompletionResponseChoice]
     usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None


 class CompletionResponseStreamChoice(BaseModel):
@@ -517,6 +518,7 @@ class ChatCompletionResponse(BaseModel):
     model: str
     choices: List[ChatCompletionResponseChoice]
     usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None


 class DeltaMessage(BaseModel):
@@ -859,15 +861,6 @@ class ResponseReasoningTextContent(BaseModel):
     type: Literal["reasoning_text"] = "reasoning_text"


-class ResponseReasoningItem(BaseModel):
-    id: str
-    content: list[ResponseReasoningTextContent] = Field(default_factory=list)
-    summary: list = Field(default_factory=list)
-    type: Literal["reasoning"] = "reasoning"
-    encrypted_content: Optional[str] = None
-    status: Optional[Literal["in_progress", "completed", "incomplete"]]
-
-
 ResponseInputOutputItem: TypeAlias = Union[
     ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
 ]

sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,18 +7,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
-from openai_harmony import Message as OpenAIMessage

 from sglang.srt.conversation import generate_chat_conv
-from sglang.srt.entrypoints.harmony_utils import (
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_input,
-    parse_output_into_messages,
-    render_for_completion,
-)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -57,30 +47,12 @@ class OpenAIServingChat(OpenAIServingBase):
     """Handler for /v1/chat/completions requests"""

     def __init__(
-        self,
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
-        self.use_harmony = (
-            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
-        )
-
-        if self.use_harmony:
-            from sglang.srt.function_call.harmony_tool_parser import (
-                HarmonyToolCallParser,
-            )
-
-            self.harmony_tool_parser = HarmonyToolCallParser()
-
-            # NOTE While OpenAI's chat completion API supports browsing
-            # for some models, currently vLLM doesn't support it. Please use the
-            # Responses API instead.
-            self.supports_browsing = False
-            self.browser_tool = None
-            # NOTE: Chat completion API does not support code interpreter.
-            # Please use the Responses API instead.
-            self.supports_code_interpreter = False
-            self.python_tool = None

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -97,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."

+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
         return None

     def _convert_to_internal_request(
@@ -107,66 +91,43 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal

         # Process messages and apply chat template
-
-        processed_messages = self._process_messages(request, is_multimodal)
-
-        # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request,
-            processed_messages.stop,
-            processed_messages.tool_call_constraint,
-        )
+        processed_messages = self._process_messages(request, is_multimodal)

-
-
-
-
-
-
-
-
-
-
-                **prompt_kwargs,
-                image_data=processed_messages.image_data,
-                video_data=processed_messages.video_data,
-                audio_data=processed_messages.audio_data,
-                sampling_params=sampling_params,
-                return_logprob=request.logprobs,
-                logprob_start_len=-1,
-                top_logprobs_num=request.top_logprobs or 0,
-                stream=request.stream,
-                return_text_in_logprobs=True,
-                modalities=processed_messages.modalities,
-                lora_path=request.lora_path,
-                bootstrap_host=request.bootstrap_host,
-                bootstrap_port=request.bootstrap_port,
-                bootstrap_room=request.bootstrap_room,
-                return_hidden_states=request.return_hidden_states,
-                rid=request.rid,
-            )
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": processed_messages.prompt}
         else:
-            processed_messages,
-
-
-            input_ids
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=processed_messages.image_data,
+            video_data=processed_messages.video_data,
+            audio_data=processed_messages.audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=processed_messages.modalities,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
+        )

         return adapted_request, request

@@ -251,14 +212,15 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
             )
         except Exception:
-            #
-            #
-            #
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
             tools = (
                 [t if "function" in t else {"function": t} for t in tools]
                 if tools
@@ -269,6 +231,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
@@ -459,12 +422,6 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}

-        # Harmony tracking
-        if self.use_harmony:
-            harmony_parsers = [
-                get_streamable_parser_for_assistant() for _ in range(request.n)
-            ]
-
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -511,58 +468,14 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"

-
-
-
-
-                    new_token_ids = content["output_ids"]
-                    for token_id in new_token_ids:
-                        harmony_parser.process(token_id)
-
-                    is_final = harmony_parser.current_channel == "final"
-                    is_analysis = harmony_parser.current_channel == "analysis"
-                    delta = harmony_parser.last_content_delta or ""
-
-                    if is_analysis:
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(reasoning_content=delta),
-                            finish_reason=None,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=int(time.time()),
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-                        continue
-
-                    choice_data = ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(content=delta if delta else None),
-                        finish_reason=None,
-                        matched_stop=None,
-                        logprobs=choice_logprobs,
-                    )
-                    chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=int(time.time()),
-                        choices=[choice_data],
-                        model=request.model,
-                    )
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                    continue
-                else:
-                    stream_buffer = stream_buffers.get(index, "")
-                    delta = content["text"][len(stream_buffer) :]
-                    stream_buffers[index] = stream_buffer + delta
+                stream_buffer = stream_buffers.get(index, "")
+                delta = content["text"][len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
-                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -581,27 +494,8 @@ class OpenAIServingChat(OpenAIServingBase):
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"

-                if self.use_harmony and not is_final:
-                    choice_data = ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(reasoning_content=delta),
-                        finish_reason=None,
-                    )
-                    chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=int(time.time()),
-                        choices=[choice_data],
-                        model=request.model,
-                    )
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-
                 # Handle tool calls
-
-                if (
-                    request.tool_choice != "none"
-                    and request.tools
-                    and not self.use_harmony
-                ):
+                if request.tool_choice != "none" and request.tools:
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -765,76 +659,6 @@ class OpenAIServingChat(OpenAIServingBase):

             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
-            output_ids = ret_item["output_ids"]
-
-            if self.use_harmony:
-                parser = parse_output_into_messages(output_ids)
-                output_msgs = parser.messages
-                if len(output_msgs) == 0:
-                    # The generation has stopped during reasoning.
-                    is_tool_call = False
-                    reasoning_content = parser.current_content
-                    final_content = None
-                elif len(output_msgs) == 1:
-                    # The generation has stopped during final message.
-                    is_tool_call = False
-                    reasoning_content = output_msgs[0].content[0].text
-                    final_content = parser.current_content
-                else:
-                    if len(output_msgs) != 2:
-                        raise ValueError(
-                            "Expected 2 output messages (reasoning and final), "
-                            f"but got {len(output_msgs)}."
-                        )
-                    reasoning_msg, final_msg = output_msgs
-                    reasoning_content = reasoning_msg.content[0].text
-                    final_content = final_msg.content[0].text
-                    is_tool_call = final_msg.recipient is not None
-
-                if is_tool_call:
-                    # Extract tool call information from final message
-                    tool_call = (
-                        self.harmony_tool_parser.extract_tool_calls_from_message(
-                            final_msg
-                        )
-                    )
-                    tool_calls = [tool_call] if tool_call else []
-
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=None,  # Tool calls don't have regular content
-                        tool_calls=tool_calls,
-                    )
-                else:
-                    # Normal message
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=final_content,
-                    )
-
-                if is_tool_call:
-                    finish_reason_type = "tool_calls"
-                elif finish_reason:
-                    finish_reason_type = (
-                        finish_reason["type"] if finish_reason else "stop"
-                    )
-                else:
-                    finish_reason_type = "stop"
-                choice_data = ChatCompletionResponseChoice(
-                    index=idx,
-                    message=message,
-                    logprobs=choice_logprobs,
-                    finish_reason=finish_reason_type,
-                    matched_stop=(
-                        finish_reason["matched"]
-                        if finish_reason and "matched" in finish_reason
-                        else None
-                    ),
-                )
-                choices.append(choice_data)
-                continue

             # Handle reasoning content
             reasoning_text = None
@@ -899,6 +723,7 @@ class OpenAIServingChat(OpenAIServingBase):
             model=request.model,
             choices=choices,
             usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
         )

     def _process_logprobs_tokens(
@@ -1184,33 +1009,3 @@ class OpenAIServingChat(OpenAIServingBase):
             return f"data: {chunk.model_dump_json()}\n\n"

         return None
-
-    def _make_request_with_harmony(
-        self,
-        request: ChatCompletionRequest,
-    ):
-        messages: list[OpenAIMessage] = []
-
-        # Add system message.
-        # In Chat Completion API, browsing is enabled by default if the model
-        # supports it.
-        assert not self.supports_browsing
-        assert not self.supports_code_interpreter
-        sys_msg = get_system_message(
-            reasoning_effort=request.reasoning_effort,
-            browser_description=None,
-            python_description=None,
-        )
-        messages.append(sys_msg)
-
-        # Add developer message.
-        dev_msg = get_developer_message()
-        messages.append(dev_msg)
-
-        # Add user message.
-        for chat_msg in request.messages:
-            messages.append(parse_chat_input(chat_msg))
-
-        # Render prompt token ids.
-        prompt_token_ids = render_for_completion(messages)
-        return messages, prompt_token_ids
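
Two of the serving_chat changes are visible to API clients: requests whose `max_completion_tokens` (or `max_tokens`) exceeds the server's `context_length` are now rejected in `_validate_request`, and `reasoning_effort` is forwarded to `apply_chat_template`. A minimal sketch of both, assuming a local server and a placeholder model name:

```python
import requests

BASE_URL = "http://localhost:30000"  # assumed local sglang server

payload = {
    "model": "my-model",  # placeholder
    "messages": [{"role": "user", "content": "Hello"}],
    "reasoning_effort": "low",            # now passed through to the chat template
    "max_completion_tokens": 10_000_000,  # deliberately larger than context_length
}
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload)

# Expect an error body mentioning "max_completion_tokens is too large: 10000000."
# instead of a generation.
print(resp.status_code, resp.text)
```
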

sglang/srt/entrypoints/openai/serving_completions.py

@@ -373,6 +373,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             created=created,
             choices=choices,
             usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
         )

     def _get_echo_text(self, request: CompletionRequest, index: int) -> str:
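
Together with the `metadata` field added to `CompletionResponse` and `ChatCompletionResponse` in protocol.py, non-streaming responses from both endpoints now report which weight version served the request. A sketch of reading it on the client side (URL and model name are assumptions):

```python
import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",  # assumed local sglang server
    json={"model": "my-model", "prompt": "Hello", "max_tokens": 8},
).json()

# New in this release: metadata sits alongside choices and usage.
print(resp.get("metadata", {}).get("weight_version"))
```
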

sglang/srt/entrypoints/openai/tool_server.py

@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any

-logger = logging.getLogger(__name__)
 try:
     from mcp import ClientSession
     from mcp.client.sse import sse_client
     from mcp.types import ListToolsResult
-except ImportError:
-
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e

 from openai_harmony import ToolDescription, ToolNamespaceConfig

+logger = logging.getLogger(__name__)
+

 async def list_server_and_tools(server_url: str):

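
The revised guard binds the optional MCP names to the `ImportError` itself rather than leaving them undefined, so `tool_server` stays importable without `mcp` installed and the failure only surfaces when the client is actually needed. An illustrative sketch of the same pattern (generic code, not part of sglang):

```python
try:
    from mcp import ClientSession  # optional dependency
except ImportError as e:
    # Defer the failure: the module stays importable, the name stays bound.
    ClientSession = e


def require_mcp() -> None:
    """Raise a clear error at use time if mcp was never importable."""
    if isinstance(ClientSession, Exception):
        raise RuntimeError(
            "the 'mcp' package is required for MCP tool servers"
        ) from ClientSession
```
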

sglang/srt/function_call/ebnf_composer.py

@@ -316,6 +316,7 @@ class EBNFComposer:

         combined_args = "".join(rule_parts)
         arguments_rule = args_template.format(arg_rules=combined_args)
+        arguments_rule = arguments_rule or '""'

         # Add the function call rule and its arguments rule
         ebnf_lines.append(

sglang/srt/function_call/function_call_parser.py

@@ -11,6 +11,7 @@ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
 from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
 from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -41,6 +42,7 @@ class FunctionCallParser:
         "qwen3_coder": Qwen3CoderDetector,
         "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
+        "gpt-oss": GptOssDetector,
     }

     def __init__(self, tools: List[Tool], tool_call_parser: str):
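
The new detector from `gpt_oss_detector.py` (+331 lines in the file list) is selected by the `gpt-oss` key, either through the constructor shown above or with the server's existing tool-call-parser launch option (e.g. `--tool-call-parser gpt-oss`). A minimal sketch using the constructor; passing an empty tool list just keeps the example self-contained, real deployments pass the request's tools:

```python
from sglang.srt.function_call.function_call_parser import FunctionCallParser

# "gpt-oss" resolves to GptOssDetector via the mapping added above;
# the constructor signature is the one shown in the hunk.
parser = FunctionCallParser(tools=[], tool_call_parser="gpt-oss")
```
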

sglang/srt/function_call/glm4_moe_detector.py

@@ -158,7 +158,7 @@ class Glm4MoeDetector(BaseFormatDetector):
             individual_call_end_token=self.eot_token,
             tool_call_separator="\\n",
             function_format="xml",
-            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
             key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
             key_value_separator="\\n",
         )