sglang 0.4.7.post1__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +8 -6
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +13 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/decode.py +22 -28
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/conn.py +12 -16
- sglang/srt/disaggregation/prefill.py +17 -13
- sglang/srt/disaggregation/utils.py +46 -18
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +22 -28
- sglang/srt/entrypoints/http_server.py +149 -79
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +67 -29
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +21 -3
- sglang/srt/layers/attention/aiter_backend.py +5 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
- sglang/srt/layers/attention/flashattention_backend.py +19 -9
- sglang/srt/layers/attention/flashinfer_backend.py +9 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
- sglang/srt/layers/attention/flashmla_backend.py +5 -2
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +19 -11
- sglang/srt/layers/communicator.py +5 -5
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
- sglang/srt/layers/moe/ep_moe/layer.py +207 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +6 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +91 -4
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +173 -74
- sglang/srt/lora/mem_pool.py +49 -45
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -15
- sglang/srt/managers/io_struct.py +9 -12
- sglang/srt/managers/schedule_batch.py +40 -31
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +147 -62
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +11 -8
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -16
- sglang/srt/mem_cache/hiradix_cache.py +34 -23
- sglang/srt/mem_cache/memory_pool.py +118 -114
- sglang/srt/mem_cache/radix_cache.py +20 -16
- sglang/srt/model_executor/cuda_graph_runner.py +76 -45
- sglang/srt/model_executor/forward_batch_info.py +18 -5
- sglang/srt/model_executor/model_runner.py +22 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +108 -26
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/server_args.py +36 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
- sglang/srt/speculative/eagle_utils.py +80 -8
- sglang/srt/speculative/eagle_worker.py +124 -41
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/utils.py +177 -11
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/METADATA +4 -10
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/RECORD +104 -93
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -2148
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/{openai_api → entrypoints/openai}/protocol.py

@@ -16,7 +16,13 @@
 import time
 from typing import Dict, List, Optional, Union
 
-from pydantic import
+from pydantic import (
+    BaseModel,
+    Field,
+    field_validator,
+    model_serializer,
+    model_validator,
+)
 from typing_extensions import Literal
 
 
@@ -167,6 +173,7 @@ class CompletionRequest(BaseModel):
     temperature: float = 1.0
     top_p: float = 1.0
     user: Optional[str] = None
+    return_hidden_states: bool = False
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -182,25 +189,34 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
-    return_hidden_states: Optional[bool] = False
 
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
+    @field_validator("max_tokens")
+    @classmethod
+    def validate_max_tokens_positive(cls, v):
+        if v is not None and v <= 0:
+            raise ValueError("max_tokens must be positive")
+        return v
+
 
 class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Literal["stop", "length", "content_filter", "abort"]
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class CompletionResponse(BaseModel):
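A minimal standalone sketch (not code from the diff) of the model_serializer(mode="wrap") pattern introduced above: the wrap handler produces the default field dict first, and the hidden_states key is then removed whenever its value is None, so the field only appears in responses that actually carry hidden states. Choice below is a hypothetical stand-in for CompletionResponseChoice.

from typing import Optional

from pydantic import BaseModel, model_serializer


class Choice(BaseModel):  # hypothetical stand-in for CompletionResponseChoice
    text: str
    hidden_states: Optional[object] = None

    @model_serializer(mode="wrap")
    def _serialize(self, handler):
        data = handler(self)  # default serialization
        if self.hidden_states is None:
            data.pop("hidden_states", None)  # drop the key when unset
        return data


print(Choice(text="hi").model_dump())
# {'text': 'hi'}
print(Choice(text="hi", hidden_states=[0.1, 0.2]).model_dump())
# {'text': 'hi', 'hidden_states': [0.1, 0.2]}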
@@ -220,9 +236,12 @@ class CompletionResponseStreamChoice(BaseModel):
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class CompletionStreamResponse(BaseModel):
@@ -380,8 +399,10 @@ class ChatCompletionRequest(BaseModel):
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
         default="auto", examples=["none"]
     )  # noqa
+    return_hidden_states: bool = False
 
-    @
+    @model_validator(mode="before")
+    @classmethod
     def set_tool_choice_default(cls, values):
         if values.get("tool_choice") is None:
             if values.get("tools") is None:
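The @model_validator(mode="before") + @classmethod pair used above runs on the raw input mapping before field validation, so one field's default can depend on another. A hedged sketch of the same mechanism follows; the hunk is truncated after the tools check, so the exact defaulting rule below is an assumption, and ToolRequest is an invented stand-in for ChatCompletionRequest.

from typing import Optional

from pydantic import BaseModel, model_validator


class ToolRequest(BaseModel):  # invented stand-in, not sglang's class
    tools: Optional[list] = None
    tool_choice: str = "none"

    @model_validator(mode="before")
    @classmethod
    def set_tool_choice_default(cls, values):
        # Assumed rule: default to "auto" only when tools are supplied.
        if isinstance(values, dict) and values.get("tool_choice") is None:
            values["tool_choice"] = "none" if values.get("tools") is None else "auto"
        return values


print(ToolRequest(tools=[{"type": "function"}]).tool_choice)  # "auto"
print(ToolRequest().tool_choice)                              # "none"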
@@ -416,9 +437,6 @@ class ChatCompletionRequest(BaseModel):
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
-    # Hidden States
-    return_hidden_states: Optional[bool] = False
-
 
 class ChatMessage(BaseModel):
     role: Optional[str] = None
@@ -431,15 +449,20 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Literal[
-        "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
-    ]
+    finish_reason: Optional[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
+    ] = None
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class ChatCompletionResponse(BaseModel):
@@ -458,9 +481,12 @@ class DeltaMessage(BaseModel):
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
     hidden_states: Optional[object] = None
 
-    @model_serializer
-    def _serialize(self):
-
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
 
 
 class ChatCompletionResponseStreamChoice(BaseModel):
@@ -487,15 +513,18 @@ class MultimodalEmbeddingInput(BaseModel):
     image: Optional[str] = None
 
 
+EmbeddingInput = Union[
+    List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
+]
+
+
 class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
-    input: Union[
-        List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
-    ]
+    input: EmbeddingInput
     model: str
     encoding_format: str = "float"
-    dimensions: int = None
+    dimensions: Optional[int] = None
     user: Optional[str] = None
 
     # The request id.
@@ -539,6 +568,11 @@ class ScoringResponse(BaseModel):
     object: str = "scoring"
 
 
+class V1RerankReqInput(BaseModel):
+    query: str
+    documents: List[str]
+
+
 class RerankResponse(BaseModel):
     score: float
     document: str
@@ -546,6 +580,10 @@ class RerankResponse(BaseModel):
     meta_info: Optional[dict] = None
 
 
-
-
-
+OpenAIServingRequest = Union[
+    ChatCompletionRequest,
+    CompletionRequest,
+    EmbeddingRequest,
+    ScoringRequest,
+    V1RerankReqInput,
+]
sglang/srt/entrypoints/openai/serving_base.py (new file)

@@ -0,0 +1,149 @@
+import json
+import logging
+import uuid
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+# Base class for specific endpoint handlers
+class OpenAIServingBase(ABC):
+    """Abstract base class for OpenAI endpoint handlers"""
+
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        self.tokenizer_manager = tokenizer_manager
+
+    async def handle_request(
+        self, request: OpenAIServingRequest, raw_request: Request
+    ) -> Union[Any, StreamingResponse, ErrorResponse]:
+        """Handle the specific request type with common pattern"""
+        try:
+            # Validate request
+            error_msg = self._validate_request(request)
+            if error_msg:
+                return self.create_error_response(error_msg)
+
+            # Convert to internal format
+            adapted_request, processed_request = self._convert_to_internal_request(
+                request
+            )
+
+            # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
+            if hasattr(request, "stream") and request.stream:
+                return await self._handle_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+            else:
+                return await self._handle_non_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+
+        except Exception as e:
+            logger.exception(f"Error in request: {e}")
+            return self.create_error_response(
+                message=f"Internal server error: {str(e)}",
+                err_type="InternalServerError",
+                status_code=500,
+            )
+
+    @abstractmethod
+    def _request_id_prefix(self) -> str:
+        """Generate request ID based on request type"""
+        pass
+
+    def _generate_request_id_base(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Generate request ID based on request type"""
+        return None
+
+        # TODO(chang): the rid is used in io_strcut check and often violates `The rid should be a list` AssertionError
+        # Temporarily return None in this function until the rid logic is clear.
+        if rid := getattr(request, "rid", None):
+            return rid
+
+        return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+
+    @abstractmethod
+    def _convert_to_internal_request(
+        self,
+        request: OpenAIServingRequest,
+    ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
+        """Convert OpenAI request to internal format"""
+        pass
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[StreamingResponse, ErrorResponse, ORJSONResponse]:
+        """Handle streaming request
+
+        Override this method in child classes that support streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[Any, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming request
+
+        Override this method in child classes that support non-streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support non-streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    def _validate_request(self, _: OpenAIServingRequest) -> Optional[str]:
+        """Validate request"""
+        pass
+
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        """Create an error response"""
+        # TODO: remove fastapi dependency in openai and move response handling to the entrypoint
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=param,
+            code=status_code,
+        )
+        return ORJSONResponse(content=error.model_dump(), status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        """Create a streaming error response"""
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=None,
+            code=status_code,
+        )
+        return json.dumps({"error": error.model_dump()})
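To show how this base class is meant to be consumed, here is a hedged sketch of a concrete subclass. The real implementations are the serving_chat.py, serving_completions.py, etc. modules from the file list above; EchoServing and its conversion logic are invented for illustration.

# Invented example subclass; not part of the sglang package.
class EchoServing(OpenAIServingBase):
    def _request_id_prefix(self) -> str:
        return "echo-"

    def _convert_to_internal_request(self, request):
        # Map the OpenAI-style request onto the internal GenerateReqInput.
        adapted = GenerateReqInput(text=getattr(request, "prompt", ""))
        return adapted, request

    async def _handle_non_streaming_request(
        self, adapted_request, request, raw_request
    ):
        # A real handler would drive tokenizer_manager.generate_request(...)
        # and build a typed OpenAI response; this one just echoes the input.
        return {
            "id": self._generate_request_id_base(request),
            "echo": adapted_request.text,
        }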