sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

@@ -14,9 +14,18 @@
 """Pydantic models for OpenAI API protocol"""
 
 import time
+import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TypeAlias, Union
 
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
 from pydantic import (
     BaseModel,
     Field,
@@ -84,6 +93,7 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
     # only used to return cached tokens when --enable-cache-report is set
     prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0
 
 
 class StreamOptions(BaseModel):
@@ -428,6 +438,13 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     ) # noqa
     return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )
 
     @model_validator(mode="before")
     @classmethod
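The new reasoning_effort field is forwarded to the chat template in the serving_chat.py hunks further down. A minimal client sketch, assuming an SGLang server on localhost:30000 and a placeholder model id; the flag is passed via extra_body so the snippet does not depend on the installed openai client version:

import openai

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="openai/gpt-oss-120b",  # placeholder model id
    messages=[{"role": "user", "content": "Outline a proof sketch of the AM-GM inequality."}],
    extra_body={"reasoning_effort": "low"},  # "low" | "medium" | "high"
)
print(resp.choices[0].message.content)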
@@ -619,6 +636,196 @@ OpenAIServingRequest = Union[
 ]
 
 
+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 1 token
+        max_tokens -= 1
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
 @dataclass
 class MessageProcessingResult:
     """Result of processing chat messages and applying templates.
@@ -645,3 +852,13 @@ class MessageProcessingResult:
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
sglang/srt/entrypoints/openai/serving_chat.py

@@ -47,7 +47,9 @@ class OpenAIServingChat(OpenAIServingBase):
     """Handler for /v1/chat/completions requests"""
 
     def __init__(
-        self,
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
@@ -67,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
         return None
 
     def _convert_to_internal_request(
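A small illustration of the new budget check with made-up numbers (a 20000-token request against a hypothetical --context-length of 8192); this simply mirrors the added validation above:

# Illustrative values only.
max_output_tokens = 20000        # request.max_completion_tokens or request.max_tokens
server_context_length = 8192     # tokenizer_manager.server_args.context_length

if max_output_tokens and server_context_length and max_output_tokens > server_context_length:
    print(
        f"max_completion_tokens is too large: {max_output_tokens}."
        f"This model supports at most {server_context_length} completion tokens."
    )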
@@ -81,7 +95,9 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Build sampling parameters
         sampling_params = self._build_sampling_params(
-            request,
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
         )
 
         # Handle single vs multiple requests
@@ -196,14 +212,15 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
             )
         except Exception:
-            #
-            #
-            #
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
             tools = (
                 [t if "function" in t else {"function": t} for t in tools]
                 if tools
|
|
214
231
|
tokenize=True,
|
215
232
|
add_generation_prompt=True,
|
216
233
|
tools=tools,
|
234
|
+
reasoning_effort=request.reasoning_effort,
|
217
235
|
**(
|
218
236
|
request.chat_template_kwargs if request.chat_template_kwargs else {}
|
219
237
|
),
|
@@ -277,6 +295,8 @@ class OpenAIServingChat(OpenAIServingBase):
             prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
+        if self._get_enable_thinking_from_request(request):
+            prompt += "<think>" # Note(Xinyuan): hard code thinking token
 
         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
@@ -448,7 +468,6 @@ class OpenAIServingChat(OpenAIServingBase):
                 )
                 yield f"data: {chunk.model_dump_json()}\n\n"
 
-                # Process content delta
                 stream_buffer = stream_buffers.get(index, "")
                 delta = content["text"][len(stream_buffer) :]
                 stream_buffers[index] = stream_buffer + delta
@@ -502,7 +521,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -645,9 +664,15 @@ class OpenAIServingChat(OpenAIServingBase):
         reasoning_text = None
         reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             try:
                 parser = ReasoningParser(
-                    model_type=reasoning_parser,
+                    model_type=reasoning_parser,
+                    stream_reasoning=False,
+                    force_reasoning=is_force_reasoning,
                 )
                 reasoning_text, text = parser.parse_non_stream(text)
             except Exception as e:
@@ -810,14 +835,19 @@ class OpenAIServingChat(OpenAIServingBase):
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
 
         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +856,7 @@ class OpenAIServingChat(OpenAIServingBase):
         Args:
             request_obj: The request object (or an item from a list of requests).
         Returns:
-            The boolean value of 'enable_thinking' if found
+            The boolean value of 'enable_thinking' if found, otherwise False.
         """
         if (
             hasattr(request, "chat_template_kwargs")
@@ -834,7 +864,7 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.chat_template_kwargs.get("enable_thinking") is not None
         ):
             return request.chat_template_kwargs.get("enable_thinking")
-        return
+        return False
 
     async def _process_tool_call_stream(
         self,
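A minimal sketch of driving the enable_thinking path end to end, assuming an SGLang server on localhost:30000 and a placeholder model whose chat template honors enable_thinking; both identifiers are illustrative:

import openai

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="Qwen/Qwen3-8B",  # placeholder model id
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    extra_body={
        # Read by the fixed _get_enable_thinking_from_request (defaults to False when absent).
        "chat_template_kwargs": {"enable_thinking": True},
        # Ask the server to split reasoning from the final answer.
        "separate_reasoning": True,
    },
)
print(resp.choices[0].message.content)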