sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (105)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/srt/configs/model_config.py +35 -0
  3. sglang/srt/conversation.py +9 -5
  4. sglang/srt/disaggregation/base/conn.py +5 -2
  5. sglang/srt/disaggregation/decode.py +6 -1
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  8. sglang/srt/disaggregation/prefill.py +2 -0
  9. sglang/srt/distributed/parallel_state.py +11 -9
  10. sglang/srt/entrypoints/context.py +244 -0
  11. sglang/srt/entrypoints/engine.py +4 -3
  12. sglang/srt/entrypoints/harmony_utils.py +370 -0
  13. sglang/srt/entrypoints/http_server.py +71 -0
  14. sglang/srt/entrypoints/openai/protocol.py +227 -1
  15. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  16. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  18. sglang/srt/entrypoints/tool.py +87 -0
  19. sglang/srt/eplb/expert_location.py +5 -1
  20. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  21. sglang/srt/hf_transformers_utils.py +30 -3
  22. sglang/srt/jinja_template_utils.py +8 -1
  23. sglang/srt/layers/attention/aiter_backend.py +5 -8
  24. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  25. sglang/srt/layers/attention/triton_backend.py +85 -14
  26. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  28. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  29. sglang/srt/layers/attention/vision.py +13 -5
  30. sglang/srt/layers/communicator.py +21 -4
  31. sglang/srt/layers/dp_attention.py +12 -0
  32. sglang/srt/layers/linear.py +2 -7
  33. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  34. sglang/srt/layers/moe/ep_moe/layer.py +77 -73
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
  37. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  38. sglang/srt/layers/moe/topk.py +12 -3
  39. sglang/srt/layers/moe/utils.py +16 -0
  40. sglang/srt/layers/quantization/__init__.py +22 -0
  41. sglang/srt/layers/quantization/fp4.py +557 -0
  42. sglang/srt/layers/quantization/fp8.py +3 -6
  43. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  44. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  45. sglang/srt/layers/quantization/mxfp4.py +651 -0
  46. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  47. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  48. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  49. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  50. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  51. sglang/srt/layers/quantization/quark/utils.py +107 -0
  52. sglang/srt/layers/quantization/unquant.py +60 -6
  53. sglang/srt/layers/quantization/w4afp8.py +1 -1
  54. sglang/srt/layers/rotary_embedding.py +225 -1
  55. sglang/srt/layers/utils.py +9 -0
  56. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  57. sglang/srt/lora/lora_manager.py +70 -14
  58. sglang/srt/lora/lora_registry.py +3 -2
  59. sglang/srt/lora/mem_pool.py +43 -5
  60. sglang/srt/managers/cache_controller.py +55 -30
  61. sglang/srt/managers/detokenizer_manager.py +1 -1
  62. sglang/srt/managers/io_struct.py +15 -3
  63. sglang/srt/managers/mm_utils.py +5 -11
  64. sglang/srt/managers/schedule_batch.py +28 -7
  65. sglang/srt/managers/scheduler.py +26 -12
  66. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  67. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  68. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  69. sglang/srt/managers/template_manager.py +35 -1
  70. sglang/srt/managers/tokenizer_manager.py +24 -6
  71. sglang/srt/managers/tp_worker.py +3 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  73. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  74. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  75. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  76. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  77. sglang/srt/model_executor/cuda_graph_runner.py +7 -6
  78. sglang/srt/model_executor/forward_batch_info.py +35 -14
  79. sglang/srt/model_executor/model_runner.py +19 -2
  80. sglang/srt/model_loader/weight_utils.py +10 -0
  81. sglang/srt/models/bailing_moe.py +425 -0
  82. sglang/srt/models/deepseek_v2.py +72 -33
  83. sglang/srt/models/ernie4.py +426 -0
  84. sglang/srt/models/ernie4_eagle.py +203 -0
  85. sglang/srt/models/gemma3n_mm.py +39 -0
  86. sglang/srt/models/glm4_moe.py +24 -12
  87. sglang/srt/models/gpt_oss.py +1134 -0
  88. sglang/srt/models/qwen2.py +6 -0
  89. sglang/srt/models/qwen2_moe.py +6 -0
  90. sglang/srt/models/qwen3_moe.py +32 -6
  91. sglang/srt/models/step3_vl.py +9 -0
  92. sglang/srt/models/transformers.py +2 -5
  93. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  94. sglang/srt/reasoning_parser.py +18 -39
  95. sglang/srt/server_args.py +142 -7
  96. sglang/srt/two_batch_overlap.py +157 -5
  97. sglang/srt/utils.py +38 -2
  98. sglang/test/runners.py +2 -2
  99. sglang/test/test_utils.py +1 -1
  100. sglang/version.py +1 -1
  101. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
  102. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
  103. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  104. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  105. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py
@@ -32,6 +32,7 @@ from typing import AsyncIterator, Callable, Dict, Optional
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

 from contextlib import asynccontextmanager
+from typing import AsyncGenerator

 import numpy as np
 import orjson
@@ -56,6 +57,7 @@ from sglang.srt.entrypoints.openai.protocol import (
     ErrorResponse,
     ModelCard,
     ModelList,
+    ResponsesRequest,
     ScoringRequest,
     V1RerankReqInput,
 )
@@ -147,6 +149,37 @@ async def lifespan(fast_api_app: FastAPI):
     )

     server_args: ServerArgs = fast_api_app.server_args
+
+    tool_server = None
+    if server_args.tool_server == "demo":
+        from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
+
+        tool_server = DemoToolServer()
+    elif server_args.tool_server:
+        from sglang.srt.entrypoints.openai.tool_server import MCPToolServer
+
+        tool_server = MCPToolServer()
+        await tool_server.add_tool_server(server_args.tool_server)
+
+    try:
+        from sglang.srt.entrypoints.openai.serving_responses import (
+            OpenAIServingResponses,
+        )
+
+        fast_api_app.state.openai_serving_responses = OpenAIServingResponses(
+            _global_state.tokenizer_manager,
+            _global_state.template_manager,
+            enable_prompt_tokens_details=True,
+            enable_force_include_usage=True,
+            tool_server=tool_server,
+        )
+    except Exception as e:
+        # print stack trace
+        import traceback
+
+        traceback.print_exc()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+
     if server_args.warmups is not None:
         await execute_warmups(
             server_args.disaggregation_mode,
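The lifespan hook above picks a tool server implementation from the string in server_args.tool_server: "demo" builds the bundled DemoToolServer, while any other non-empty value is treated as an MCP tool-server specification and registered on an MCPToolServer. A minimal standalone sketch of that wiring, using only the classes imported above (the "demo" argument and any endpoint URL you pass instead are placeholders):

    import asyncio
    from typing import Optional

    from sglang.srt.entrypoints.openai.tool_server import DemoToolServer, MCPToolServer


    async def build_tool_server(spec: Optional[str]):
        # Mirrors the branch in lifespan(): "demo" -> DemoToolServer,
        # any other non-empty value -> MCPToolServer with the spec registered.
        if spec == "demo":
            return DemoToolServer()
        if spec:
            server = MCPToolServer()
            await server.add_tool_server(spec)  # e.g. an MCP server URL
            return server
        return None


    tool_server = asyncio.run(build_tool_server("demo"))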
@@ -843,6 +876,42 @@ async def v1_score_request(request: ScoringRequest, raw_request: Request):
     )


+@app.post("/v1/responses", dependencies=[Depends(validate_json_request)])
+async def v1_responses_request(request: dict, raw_request: Request):
+    """Endpoint for the responses API with reasoning support."""
+
+    request_obj = ResponsesRequest(**request)
+    result = await raw_request.app.state.openai_serving_responses.create_responses(
+        request_obj, raw_request
+    )
+
+    # Handle streaming responses
+    if isinstance(result, AsyncGenerator):
+        return StreamingResponse(
+            result,
+            media_type="text/event-stream",
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+        )
+
+    return result
+
+
+@app.get("/v1/responses/{response_id}")
+async def v1_retrieve_responses(response_id: str, raw_request: Request):
+    """Retrieve a response by ID."""
+    return await raw_request.app.state.openai_serving_responses.retrieve_responses(
+        response_id
+    )
+
+
+@app.post("/v1/responses/{response_id}/cancel")
+async def v1_cancel_responses(response_id: str, raw_request: Request):
+    """Cancel a background response."""
+    return await raw_request.app.state.openai_serving_responses.cancel_responses(
+        response_id
+    )
+
+
 @app.api_route(
     "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
 )
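The three new routes mirror the create/retrieve/cancel surface of the OpenAI Responses API. As a rough client-side sketch (the host/port and model name are placeholders; the request fields follow the ResponsesRequest model defined later in this diff):

    import requests

    BASE = "http://localhost:30000"  # placeholder: wherever the SGLang HTTP server is listening

    # Create a stored, background response.
    created = requests.post(
        f"{BASE}/v1/responses",
        json={
            "model": "openai/gpt-oss-20b",  # placeholder model name
            "input": "Summarize the Responses API in one sentence.",
            "reasoning": {"effort": "low"},
            "background": True,
        },
    ).json()
    response_id = created["id"]

    # Poll the stored response, then cancel it if it is still running.
    print(requests.get(f"{BASE}/v1/responses/{response_id}").json().get("status"))
    requests.post(f"{BASE}/v1/responses/{response_id}/cancel")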
@@ -1103,6 +1172,8 @@ def _wait_and_warmup(
             pipe_finish_writer,
         ):
             return
+    else:
+        _global_state.tokenizer_manager.server_status = ServerStatus.Up

     logger.info("The server is fired up and ready to roll!")

sglang/srt/entrypoints/openai/protocol.py
@@ -14,9 +14,18 @@
 """Pydantic models for OpenAI API protocol"""

 import time
+import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TypeAlias, Union

+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
 from pydantic import (
     BaseModel,
     Field,
@@ -84,6 +93,7 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
     # only used to return cached tokens when --enable-cache-report is set
     prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0


 class StreamOptions(BaseModel):
@@ -428,6 +438,13 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     )  # noqa
     return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )

     @model_validator(mode="before")
     @classmethod
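The new reasoning_effort field is also accepted on chat completion requests. A hedged sketch of sending it to the OpenAI-compatible endpoint (host/port and model name are placeholders):

    import requests

    payload = {
        "model": "openai/gpt-oss-20b",  # placeholder model name
        "messages": [{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
        "reasoning_effort": "high",  # new field: "low" | "medium" | "high" (default "medium")
    }
    out = requests.post("http://localhost:30000/v1/chat/completions", json=payload).json()
    print(out["choices"][0]["message"]["content"])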
@@ -619,6 +636,196 @@ OpenAIServingRequest = Union[
 ]


+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None  # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 1 token
+        max_tokens -= 1
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
 @dataclass
 class MessageProcessingResult:
     """Result of processing chat messages and applying templates.
@@ -645,3 +852,22 @@ class MessageProcessingResult:
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+class ResponseReasoningItem(BaseModel):
+    id: str
+    content: list[ResponseReasoningTextContent] = Field(default_factory=list)
+    summary: list = Field(default_factory=list)
+    type: Literal["reasoning"] = "reasoning"
+    encrypted_content: Optional[str] = None
+    status: Optional[Literal["in_progress", "completed", "incomplete"]]
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
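The module-level ResponseReasoningItem defined here shadows the one imported from openai.types.responses, and ResponseInputOutputItem is declared a second time after it. A small construction sketch (the id value is a placeholder; note that status declares no default, so it must be supplied, even if only as None):

    from sglang.srt.entrypoints.openai.protocol import (
        ResponseReasoningItem,
        ResponseReasoningTextContent,
    )

    item = ResponseReasoningItem(
        id="rs_0",  # placeholder id
        content=[ResponseReasoningTextContent(text="First, restate the claim ...")],
        status="completed",
    )
    print(item.type)  # "reasoning"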