sglang 0.4.7.post1__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. sglang/bench_one_batch.py +8 -6
  2. sglang/srt/_custom_ops.py +2 -2
  3. sglang/srt/code_completion_parser.py +2 -44
  4. sglang/srt/constants.py +3 -0
  5. sglang/srt/conversation.py +13 -3
  6. sglang/srt/custom_op.py +5 -1
  7. sglang/srt/disaggregation/decode.py +22 -28
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  9. sglang/srt/disaggregation/mini_lb.py +34 -4
  10. sglang/srt/disaggregation/mooncake/conn.py +12 -16
  11. sglang/srt/disaggregation/prefill.py +17 -13
  12. sglang/srt/disaggregation/utils.py +46 -18
  13. sglang/srt/distributed/parallel_state.py +12 -4
  14. sglang/srt/entrypoints/engine.py +22 -28
  15. sglang/srt/entrypoints/http_server.py +149 -79
  16. sglang/srt/entrypoints/http_server_engine.py +0 -3
  17. sglang/srt/entrypoints/openai/__init__.py +0 -0
  18. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +67 -29
  19. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  20. sglang/srt/entrypoints/openai/serving_chat.py +921 -0
  21. sglang/srt/entrypoints/openai/serving_completions.py +424 -0
  22. sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
  23. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  24. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  25. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  26. sglang/srt/entrypoints/openai/utils.py +72 -0
  27. sglang/srt/function_call/base_format_detector.py +7 -4
  28. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  29. sglang/srt/function_call/ebnf_composer.py +64 -10
  30. sglang/srt/function_call/function_call_parser.py +6 -6
  31. sglang/srt/function_call/llama32_detector.py +1 -1
  32. sglang/srt/function_call/mistral_detector.py +1 -1
  33. sglang/srt/function_call/pythonic_detector.py +1 -1
  34. sglang/srt/function_call/qwen25_detector.py +1 -1
  35. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  36. sglang/srt/layers/activation.py +21 -3
  37. sglang/srt/layers/attention/aiter_backend.py +5 -2
  38. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  39. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
  40. sglang/srt/layers/attention/flashattention_backend.py +19 -9
  41. sglang/srt/layers/attention/flashinfer_backend.py +9 -6
  42. sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
  43. sglang/srt/layers/attention/flashmla_backend.py +5 -2
  44. sglang/srt/layers/attention/tbo_backend.py +3 -3
  45. sglang/srt/layers/attention/triton_backend.py +19 -11
  46. sglang/srt/layers/communicator.py +5 -5
  47. sglang/srt/layers/dp_attention.py +11 -2
  48. sglang/srt/layers/layernorm.py +29 -2
  49. sglang/srt/layers/logits_processor.py +2 -2
  50. sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
  51. sglang/srt/layers/moe/ep_moe/layer.py +207 -1
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +6 -0
  54. sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
  55. sglang/srt/layers/moe/topk.py +91 -4
  56. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  57. sglang/srt/layers/quantization/fp8.py +25 -17
  58. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  59. sglang/srt/layers/quantization/utils.py +5 -2
  60. sglang/srt/layers/rotary_embedding.py +42 -2
  61. sglang/srt/layers/sampler.py +1 -1
  62. sglang/srt/lora/lora_manager.py +173 -74
  63. sglang/srt/lora/mem_pool.py +49 -45
  64. sglang/srt/lora/utils.py +1 -1
  65. sglang/srt/managers/cache_controller.py +33 -15
  66. sglang/srt/managers/io_struct.py +9 -12
  67. sglang/srt/managers/schedule_batch.py +40 -31
  68. sglang/srt/managers/schedule_policy.py +70 -56
  69. sglang/srt/managers/scheduler.py +147 -62
  70. sglang/srt/managers/template_manager.py +226 -0
  71. sglang/srt/managers/tokenizer_manager.py +11 -8
  72. sglang/srt/managers/tp_worker.py +12 -2
  73. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  74. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  75. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  76. sglang/srt/mem_cache/chunk_cache.py +11 -16
  77. sglang/srt/mem_cache/hiradix_cache.py +34 -23
  78. sglang/srt/mem_cache/memory_pool.py +118 -114
  79. sglang/srt/mem_cache/radix_cache.py +20 -16
  80. sglang/srt/model_executor/cuda_graph_runner.py +76 -45
  81. sglang/srt/model_executor/forward_batch_info.py +18 -5
  82. sglang/srt/model_executor/model_runner.py +22 -6
  83. sglang/srt/model_loader/loader.py +8 -1
  84. sglang/srt/model_loader/weight_utils.py +11 -2
  85. sglang/srt/models/deepseek_nextn.py +29 -27
  86. sglang/srt/models/deepseek_v2.py +108 -26
  87. sglang/srt/models/glm4.py +312 -0
  88. sglang/srt/models/mimo_mtp.py +2 -18
  89. sglang/srt/reasoning_parser.py +21 -11
  90. sglang/srt/server_args.py +36 -8
  91. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
  92. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
  93. sglang/srt/speculative/eagle_utils.py +80 -8
  94. sglang/srt/speculative/eagle_worker.py +124 -41
  95. sglang/srt/torch_memory_saver_adapter.py +19 -15
  96. sglang/srt/utils.py +177 -11
  97. sglang/test/test_block_fp8_ep.py +1 -0
  98. sglang/test/test_utils.py +1 -0
  99. sglang/version.py +1 -1
  100. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/METADATA +4 -10
  101. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/RECORD +104 -93
  102. sglang/srt/entrypoints/verl_engine.py +0 -179
  103. sglang/srt/openai_api/adapter.py +0 -2148
  104. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
  105. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
  106. {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/{openai_api → entrypoints/openai}/protocol.py
@@ -16,7 +16,13 @@
 import time
 from typing import Dict, List, Optional, Union
 
-from pydantic import BaseModel, Field, model_serializer, root_validator
+from pydantic import (
+    BaseModel,
+    Field,
+    field_validator,
+    model_serializer,
+    model_validator,
+)
 from typing_extensions import Literal
 
 
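The import swap above is the heart of this file's pydantic v1 → v2 migration: `root_validator(pre=True)` becomes `model_validator(mode="before")`, and per-field checks use the new `field_validator`. A minimal sketch of both v2 patterns as this diff applies them (the model below is a stripped-down illustration, not an sglang class):

```python
from typing import List, Optional

from pydantic import BaseModel, field_validator, model_validator


class ExampleRequest(BaseModel):  # illustrative stand-in, not part of sglang
    max_tokens: Optional[int] = None
    tools: Optional[List[dict]] = None
    tool_choice: Optional[str] = None

    @field_validator("max_tokens")
    @classmethod
    def validate_max_tokens_positive(cls, v):
        # Single-field check; replaces a pydantic v1 @validator.
        if v is not None and v <= 0:
            raise ValueError("max_tokens must be positive")
        return v

    @model_validator(mode="before")
    @classmethod
    def set_tool_choice_default(cls, values):
        # Runs on the raw input dict before field parsing;
        # replaces pydantic v1 @root_validator(pre=True).
        if values.get("tool_choice") is None:
            values["tool_choice"] = "none" if values.get("tools") is None else "auto"
        return values
```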
@@ -167,6 +173,7 @@ class CompletionRequest(BaseModel):
     temperature: float = 1.0
     top_p: float = 1.0
     user: Optional[str] = None
+    return_hidden_states: bool = False
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -182,25 +189,34 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
-    return_hidden_states: Optional[bool] = False
 
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
+    @field_validator("max_tokens")
+    @classmethod
+    def validate_max_tokens_positive(cls, v):
+        if v is not None and v <= 0:
+            raise ValueError("max_tokens must be positive")
+        return v
+
 
 class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Literal["stop", "length", "content_filter", "abort"]
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-        return exclude_if_none(self, ["hidden_states"])
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class CompletionResponse(BaseModel):
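The wrap-mode serializer above replaces the module-level `exclude_if_none` helper (removed at the end of this file): `mode="wrap"` hands the serializer a `handler` that produces the default dict, which is then post-processed to drop `hidden_states` when it is `None`, so the field is omitted from responses rather than emitted as `null`. A self-contained sketch of the behavior, using a trimmed stand-in model:

```python
from typing import Optional

from pydantic import BaseModel, model_serializer


class Choice(BaseModel):  # trimmed stand-in for CompletionResponseChoice
    text: str
    hidden_states: Optional[object] = None

    @model_serializer(mode="wrap")
    def _serialize(self, handler):
        data = handler(self)  # default field-by-field serialization
        if self.hidden_states is None:
            data.pop("hidden_states", None)  # omit instead of emitting null
        return data


print(Choice(text="hi").model_dump())
# -> {'text': 'hi'}
print(Choice(text="hi", hidden_states=[0.1]).model_dump())
# -> {'text': 'hi', 'hidden_states': [0.1]}
```

The same pattern recurs in the three serializer changes below.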
@@ -220,9 +236,12 @@ class CompletionResponseStreamChoice(BaseModel):
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-        return exclude_if_none(self, ["hidden_states"])
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class CompletionStreamResponse(BaseModel):
@@ -380,8 +399,10 @@ class ChatCompletionRequest(BaseModel):
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
         default="auto", examples=["none"]
     )  # noqa
+    return_hidden_states: bool = False
 
-    @root_validator(pre=True)
+    @model_validator(mode="before")
+    @classmethod
     def set_tool_choice_default(cls, values):
         if values.get("tool_choice") is None:
             if values.get("tools") is None:
@@ -416,9 +437,6 @@ class ChatCompletionRequest(BaseModel):
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
-    # Hidden States
-    return_hidden_states: Optional[bool] = False
-
 
 class ChatMessage(BaseModel):
     role: Optional[str] = None
@@ -431,15 +449,20 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Literal[
-        "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
-    ]
+    finish_reason: Optional[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
+    ] = None
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-        return exclude_if_none(self, ["hidden_states"])
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class ChatCompletionResponse(BaseModel):
@@ -458,9 +481,12 @@ class DeltaMessage(BaseModel):
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-        return exclude_if_none(self, ["hidden_states"])
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class ChatCompletionResponseStreamChoice(BaseModel):
@@ -487,15 +513,18 @@ class MultimodalEmbeddingInput(BaseModel):
     image: Optional[str] = None
 
 
+EmbeddingInput = Union[
+    List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
+]
+
+
 class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
-    input: Union[
-        List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
-    ]
+    input: EmbeddingInput
     model: str
     encoding_format: str = "float"
-    dimensions: int = None
+    dimensions: Optional[int] = None
     user: Optional[str] = None
 
     # The request id.
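The `dimensions` change is more than cosmetic under pydantic v2: a `None` default on a plain `int` field slips through because defaults are not validated, but an explicit `"dimensions": null` in a request body would be rejected. A quick sketch with throwaway models:

```python
from typing import Optional

from pydantic import BaseModel, ValidationError


class Old(BaseModel):
    dimensions: int = None  # default bypasses validation in pydantic v2


class New(BaseModel):
    dimensions: Optional[int] = None


Old()  # ok: the default value is never validated
try:
    Old(dimensions=None)  # an explicit None IS validated -> rejected
except ValidationError:
    print("plain `int` rejects an explicit None")
New(dimensions=None)  # accepted under Optional[int]
```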
@@ -539,6 +568,11 @@ class ScoringResponse(BaseModel):
     object: str = "scoring"
 
 
+class V1RerankReqInput(BaseModel):
+    query: str
+    documents: List[str]
+
+
 class RerankResponse(BaseModel):
     score: float
     document: str
@@ -546,6 +580,10 @@ class RerankResponse(BaseModel):
     meta_info: Optional[dict] = None
 
 
-def exclude_if_none(obj, field_names: List[str]):
-    omit_if_none_fields = {k for k, v in obj.model_fields.items() if k in field_names}
-    return {k: v for k, v in obj if k not in omit_if_none_fields or v is not None}
+OpenAIServingRequest = Union[
+    ChatCompletionRequest,
+    CompletionRequest,
+    EmbeddingRequest,
+    ScoringRequest,
+    V1RerankReqInput,
+]
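The removed `exclude_if_none` helper is fully superseded by the wrap-mode serializers above, and the new `OpenAIServingRequest` union gives the refactored handlers in `serving_base.py` (next file) a single request type to dispatch on. A hedged illustration of how such a union can be narrowed per request type (`is_streaming` is an invented helper, not sglang API):

```python
from sglang.srt.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    CompletionRequest,
    OpenAIServingRequest,
)


def is_streaming(request: OpenAIServingRequest) -> bool:
    # Only chat and text completions carry a `stream` flag; embedding,
    # scoring, and rerank requests never stream.
    return isinstance(
        request, (ChatCompletionRequest, CompletionRequest)
    ) and bool(request.stream)
```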
sglang/srt/entrypoints/openai/serving_base.py (new file)
@@ -0,0 +1,149 @@
+import json
+import logging
+import uuid
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+# Base class for specific endpoint handlers
+class OpenAIServingBase(ABC):
+    """Abstract base class for OpenAI endpoint handlers"""
+
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        self.tokenizer_manager = tokenizer_manager
+
+    async def handle_request(
+        self, request: OpenAIServingRequest, raw_request: Request
+    ) -> Union[Any, StreamingResponse, ErrorResponse]:
+        """Handle the specific request type with common pattern"""
+        try:
+            # Validate request
+            error_msg = self._validate_request(request)
+            if error_msg:
+                return self.create_error_response(error_msg)
+
+            # Convert to internal format
+            adapted_request, processed_request = self._convert_to_internal_request(
+                request
+            )
+
+            # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
+            if hasattr(request, "stream") and request.stream:
+                return await self._handle_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+            else:
+                return await self._handle_non_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+
+        except Exception as e:
+            logger.exception(f"Error in request: {e}")
+            return self.create_error_response(
+                message=f"Internal server error: {str(e)}",
+                err_type="InternalServerError",
+                status_code=500,
+            )
+
+    @abstractmethod
+    def _request_id_prefix(self) -> str:
+        """Generate request ID based on request type"""
+        pass
+
+    def _generate_request_id_base(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Generate request ID based on request type"""
+        return None
+
+        # TODO(chang): the rid is used in io_strcut check and often violates `The rid should be a list` AssertionError
+        # Temporarily return None in this function until the rid logic is clear.
+        if rid := getattr(request, "rid", None):
+            return rid
+
+        return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+
+    @abstractmethod
+    def _convert_to_internal_request(
+        self,
+        request: OpenAIServingRequest,
+    ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
+        """Convert OpenAI request to internal format"""
+        pass
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[StreamingResponse, ErrorResponse, ORJSONResponse]:
+        """Handle streaming request
+
+        Override this method in child classes that support streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[Any, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming request
+
+        Override this method in child classes that support non-streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support non-streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    def _validate_request(self, _: OpenAIServingRequest) -> Optional[str]:
+        """Validate request"""
+        pass
+
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        """Create an error response"""
+        # TODO: remove fastapi dependency in openai and move response handling to the entrypoint
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=param,
+            code=status_code,
+        )
+        return ORJSONResponse(content=error.model_dump(), status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        """Create a streaming error response"""
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=None,
+            code=status_code,
+        )
+        return json.dumps({"error": error.model_dump()})
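A concrete handler subclasses `OpenAIServingBase`, supplies `_request_id_prefix` and `_convert_to_internal_request`, and overrides whichever `_handle_*` hooks it supports (the shipped `serving_completions.py` and friends do exactly this, at much greater length). A minimal sketch under those assumptions; the class, its field mapping, and the `generate_request(...).__anext__()` call pattern are illustrative, not the actual sglang implementation:

```python
from fastapi import Request

from sglang.srt.entrypoints.openai.protocol import CompletionRequest
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
from sglang.srt.managers.io_struct import GenerateReqInput


class MinimalCompletionServing(OpenAIServingBase):  # hypothetical subclass
    def _request_id_prefix(self) -> str:
        return "cmpl-"

    def _convert_to_internal_request(self, request: CompletionRequest):
        # Map the OpenAI-style body onto the engine's internal input;
        # real handlers also translate sampling params, logprobs, etc.
        adapted = GenerateReqInput(
            text=request.prompt,
            sampling_params={"temperature": request.temperature},
            stream=request.stream,
        )
        return adapted, request

    async def _handle_non_streaming_request(
        self, adapted_request, request, raw_request: Request
    ):
        # Assumes tokenizer_manager.generate_request yields results as an
        # async generator; a single non-streamed result is its first item.
        return await self.tokenizer_manager.generate_request(
            adapted_request, raw_request
        ).__anext__()
```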