sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

@@ -14,9 +14,18 @@
 """Pydantic models for OpenAI API protocol"""

 import time
+import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TypeAlias, Union

+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
 from pydantic import (
     BaseModel,
     Field,
@@ -84,6 +93,7 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
     # only used to return cached tokens when --enable-cache-report is set
     prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0


 class StreamOptions(BaseModel):
@@ -428,6 +438,13 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     ) # noqa
     return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )

     @model_validator(mode="before")
     @classmethod
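The hunk above adds a `reasoning_effort` field to `ChatCompletionRequest`. As a minimal client-side sketch (not taken from this diff), the field can be passed through the standard `openai` client's `extra_body`; the server URL and model name below are illustrative assumptions:

```python
# Sketch only: send the new reasoning_effort field to /v1/chat/completions.
# Assumes an sglang 0.5.0rc0 server is running at localhost:30000 serving a
# gpt-oss checkpoint; both the port and the model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    extra_body={"reasoning_effort": "low"},  # one of "low" | "medium" | "high"
)
print(resp.choices[0].message.content)
```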
@@ -619,6 +636,196 @@ OpenAIServingRequest = Union[
 ]


+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 1 token
+        max_tokens -= 1
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
 @dataclass
 class MessageProcessingResult:
     """Result of processing chat messages and applying templates.
@@ -645,3 +852,22 @@ class MessageProcessingResult:
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+class ResponseReasoningItem(BaseModel):
+    id: str
+    content: list[ResponseReasoningTextContent] = Field(default_factory=list)
+    summary: list = Field(default_factory=list)
+    type: Literal["reasoning"] = "reasoning"
+    encrypted_content: Optional[str] = None
+    status: Optional[Literal["in_progress", "completed", "incomplete"]]
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
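To make the new Responses protocol models above concrete, here is a small sketch (not part of the diff) that constructs a `ResponsesRequest` and converts it with `to_sampling_params`. It assumes the 0.5.0rc0 wheel and its `openai` dependency are installed so the import resolves; the model name and the `default_max_tokens` value are placeholders:

```python
# Sketch: exercising the ResponsesRequest model added in the hunks above.
from sglang.srt.entrypoints.openai.protocol import ResponsesRequest

req = ResponsesRequest(
    input="Summarize the release notes in one sentence.",
    model="openai/gpt-oss-20b",       # hypothetical model name
    max_output_tokens=256,
    reasoning={"effort": "low"},      # coerced into ResponseReasoningParam
)

# default_max_tokens would normally come from the server's context length;
# 4096 is an arbitrary illustration value.
sampling_params = req.to_sampling_params(default_max_tokens=4096)
print(sampling_params["max_new_tokens"])  # min(256, 4096) - 1 == 255
print(sampling_params["temperature"])     # falls back to the 0.7 class default
```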
sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,8 +7,18 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from openai_harmony import Message as OpenAIMessage

 from sglang.srt.conversation import generate_chat_conv
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_input,
+    parse_output_into_messages,
+    render_for_completion,
+)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -51,6 +61,26 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            from sglang.srt.function_call.harmony_tool_parser import (
+                HarmonyToolCallParser,
+            )
+
+            self.harmony_tool_parser = HarmonyToolCallParser()
+
+            # NOTE While OpenAI's chat completion API supports browsing
+            # for some models, currently vLLM doesn't support it. Please use the
+            # Responses API instead.
+            self.supports_browsing = False
+            self.browser_tool = None
+            # NOTE: Chat completion API does not support code interpreter.
+            # Please use the Responses API instead.
+            self.supports_code_interpreter = False
+            self.python_tool = None

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -77,41 +107,66 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal

         # Process messages and apply chat template
-
-
-
-
-
-
+        if not self.use_harmony:
+            processed_messages = self._process_messages(request, is_multimodal)
+
+            # Build sampling parameters
+            sampling_params = self._build_sampling_params(
+                request,
+                processed_messages.stop,
+                processed_messages.tool_call_constraint,
+            )

-
-
-
-        else:
-            if isinstance(processed_messages.prompt_ids, str):
-                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            # Handle single vs multiple requests
+            if is_multimodal:
+                prompt_kwargs = {"text": processed_messages.prompt}
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if isinstance(processed_messages.prompt_ids, str):
+                    prompt_kwargs = {"text": processed_messages.prompt_ids}
+                else:
+                    prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+            adapted_request = GenerateReqInput(
+                **prompt_kwargs,
+                image_data=processed_messages.image_data,
+                video_data=processed_messages.video_data,
+                audio_data=processed_messages.audio_data,
+                sampling_params=sampling_params,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                stream=request.stream,
+                return_text_in_logprobs=True,
+                modalities=processed_messages.modalities,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
+        else:
+            processed_messages, prompt_ids = self._make_request_with_harmony(request)
+
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_ids,
+                sampling_params=self._build_sampling_params(
+                    request,
+                    request.stop,
+                    tool_call_constraint=None,
+                ),
+                stream=request.stream,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                return_text_in_logprobs=True,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )

         return adapted_request, request

@@ -277,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase):
             prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
+        if self._get_enable_thinking_from_request(request):
+            prompt += "<think>" # Note(Xinyuan): hard code thinking token

         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
@@ -402,6 +459,12 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}

+        # Harmony tracking
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant() for _ in range(request.n)
+            ]
+
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -449,14 +512,57 @@ class OpenAIServingChat(OpenAIServingBase):
                     yield f"data: {chunk.model_dump_json()}\n\n"

                 # Process content delta
-
-
-
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[index]
+
+                    new_token_ids = content["output_ids"]
+                    for token_id in new_token_ids:
+                        harmony_parser.process(token_id)
+
+                    is_final = harmony_parser.current_channel == "final"
+                    is_analysis = harmony_parser.current_channel == "analysis"
+                    delta = harmony_parser.last_content_delta or ""
+
+                    if is_analysis:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=delta),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(content=delta if delta else None),
+                        finish_reason=None,
+                        matched_stop=None,
+                        logprobs=choice_logprobs,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                    continue
+                else:
+                    stream_buffer = stream_buffers.get(index, "")
+                    delta = content["text"][len(stream_buffer) :]
+                    stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
+                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -475,8 +581,27 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"

+                if self.use_harmony and not is_final:
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(reasoning_content=delta),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
                 # Handle tool calls
-
+                # TODO: support tool call parsing for harmony
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and not self.use_harmony
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -502,7 +627,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -640,14 +765,90 @@ class OpenAIServingChat(OpenAIServingBase):

             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
+            output_ids = ret_item["output_ids"]
+
+            if self.use_harmony:
+                parser = parse_output_into_messages(output_ids)
+                output_msgs = parser.messages
+                if len(output_msgs) == 0:
+                    # The generation has stopped during reasoning.
+                    is_tool_call = False
+                    reasoning_content = parser.current_content
+                    final_content = None
+                elif len(output_msgs) == 1:
+                    # The generation has stopped during final message.
+                    is_tool_call = False
+                    reasoning_content = output_msgs[0].content[0].text
+                    final_content = parser.current_content
+                else:
+                    if len(output_msgs) != 2:
+                        raise ValueError(
+                            "Expected 2 output messages (reasoning and final), "
+                            f"but got {len(output_msgs)}."
+                        )
+                    reasoning_msg, final_msg = output_msgs
+                    reasoning_content = reasoning_msg.content[0].text
+                    final_content = final_msg.content[0].text
+                    is_tool_call = final_msg.recipient is not None
+
+                if is_tool_call:
+                    # Extract tool call information from final message
+                    tool_call = (
+                        self.harmony_tool_parser.extract_tool_calls_from_message(
+                            final_msg
+                        )
+                    )
+                    tool_calls = [tool_call] if tool_call else []
+
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=None, # Tool calls don't have regular content
+                        tool_calls=tool_calls,
+                    )
+                else:
+                    # Normal message
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                if is_tool_call:
+                    finish_reason_type = "tool_calls"
+                elif finish_reason:
+                    finish_reason_type = (
+                        finish_reason["type"] if finish_reason else "stop"
+                    )
+                else:
+                    finish_reason_type = "stop"
+                choice_data = ChatCompletionResponseChoice(
+                    index=idx,
+                    message=message,
+                    logprobs=choice_logprobs,
+                    finish_reason=finish_reason_type,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                choices.append(choice_data)
+                continue

             # Handle reasoning content
             reasoning_text = None
             reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
             if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_enable_thinking_from_request(request)
+                )
                 try:
                     parser = ReasoningParser(
-                        model_type=reasoning_parser,
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
                     )
                     reasoning_text, text = parser.parse_non_stream(text)
                 except Exception as e:
@@ -810,14 +1011,19 @@ class OpenAIServingChat(OpenAIServingBase):
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)

-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.

         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase):
         Args:
             request_obj: The request object (or an item from a list of requests).
         Returns:
-            The boolean value of 'enable_thinking' if found
+            The boolean value of 'enable_thinking' if found, otherwise False.
         """
         if (
             hasattr(request, "chat_template_kwargs")
@@ -834,7 +1040,7 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.chat_template_kwargs.get("enable_thinking") is not None
         ):
             return request.chat_template_kwargs.get("enable_thinking")
-        return
+        return False

     async def _process_tool_call_stream(
         self,
@@ -978,3 +1184,33 @@ class OpenAIServingChat(OpenAIServingBase):
                 return f"data: {chunk.model_dump_json()}\n\n"

         return None
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add system message.
+        # In Chat Completion API, browsing is enabled by default if the model
+        # supports it.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None,
+        )
+        messages.append(sys_msg)
+
+        # Add developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add user message.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        return messages, prompt_token_ids
|