sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff compares the published contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -23,6 +23,7 @@ import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import tempfile
 import threading
 import time
 from http import HTTPStatus

@@ -91,11 +92,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    deserialize_data,
+    get_main_process_id,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
 from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,

@@ -130,8 +138,79 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


+# Function to set up all middlewares for multi-tokenizer compatibility
+def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
+    """Setup all middlewares for both single and multi-process modes"""
+    worker_pid = os.getpid()
+
+    if api_key:
+        add_api_key_middleware(app, api_key)
+        logger.info(f"Worker {worker_pid} added API key middleware")
+
+    if enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+
+
+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
+    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
+    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
+    port_args, server_args = deserialize_data(port_args_data, server_args_data)
+    scheduler_info = scheduler_info_data
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    server_args = getattr(fast_api_app, "server_args", None)
+    if server_args is None:
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args = await init_multi_tokenizer()
+        setup_middlewares(
+            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
+        )
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager

@@ -191,7 +270,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")


 # Fast API

@@ -480,6 +567,16 @@ async def flush_cache():
     )


+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
 @app.api_route("/start_profile", methods=["GET", "POST"])
 async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""

@@ -1068,9 +1165,19 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-
-
-
+    if server_args.tokenizer_worker_num > 1:
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,

@@ -1079,42 +1186,75 @@ def launch_server(
         )
     )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if server_args.tokenizer_worker_num > 1:
+        port_args_shm, server_args_shm, scheduler_info_shm = (
+            write_data_for_multi_tokenizer(
+                port_args,
+                server_args,
+                scheduler_info,
+            )
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread

     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-
-
-
-
-
-
-
-
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-
+        if server_args.tokenizer_worker_num > 1:
+            port_args_shm.unlink()
+            server_args_shm.unlink()
+            scheduler_info_shm.unlink()
+            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+        else:
+            warmup_thread.join()


 def _execute_server_warmup(
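The hunks above add an opt-in multi-tokenizer mode (multiple uvicorn workers that read the server configuration from shared memory and attach to the scheduler over IPC) and a new `/clear_hicache_storage_backend` route. The snippet below is only an illustrative client call against that new route; it assumes a server already running locally on the default port 30000 and is not part of the diff.

import requests  # assumed to be installed in the client environment

# The new route accepts both GET and POST.
resp = requests.post("http://localhost:30000/clear_hicache_storage_backend")
print(resp.status_code)  # 200 when the hierarchical cache storage backend was cleared
print(resp.text)         # "Hierarchical cache storage backend cleared."
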
sglang/srt/entrypoints/openai/protocol.py
CHANGED
@@ -35,6 +35,8 @@ from pydantic import (
 )
 from typing_extensions import Literal

+DEFAULT_MODEL_NAME = "default"
+

 class ModelCard(BaseModel):
     """Model cards."""

@@ -108,6 +110,23 @@ class JsonSchemaResponseFormat(BaseModel):
     strict: Optional[bool] = False


+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
 class FileRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/files/create
     file: bytes  # The File object (not file name) to be uploaded

@@ -166,7 +185,7 @@ class BatchResponse(BaseModel):
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
     echo: bool = False

@@ -200,6 +219,7 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None

     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None

@@ -327,7 +347,7 @@ class ToolCall(BaseModel):


 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant", "tool"]
+    role: Literal["system", "assistant", "tool", "function"]
     content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
         default=None
     )

@@ -341,9 +361,9 @@ class ChatCompletionMessageGenericParam(BaseModel):
     def _normalize_role(cls, v):
         if isinstance(v, str):
             v_lower = v.lower()
-            if v_lower not in {"system", "assistant", "tool"}:
+            if v_lower not in {"system", "assistant", "tool", "function"}:
                 raise ValueError(
-                    "'role' must be one of 'system', 'assistant', or '
+                    "'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
                 )
             return v_lower
         raise ValueError("'role' must be a string")

@@ -359,23 +379,6 @@ ChatCompletionMessageParam = Union[
 ]


-class ResponseFormat(BaseModel):
-    type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
-
-
-class StructuresResponseFormat(BaseModel):
-    begin: str
-    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
-    end: str
-
-
-class StructuralTagResponseFormat(BaseModel):
-    type: Literal["structural_tag"]
-    structures: List[StructuresResponseFormat]
-    triggers: List[str]
-
-
 class Function(BaseModel):
     """Function descriptions."""

@@ -409,7 +412,7 @@ class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: bool = False

@@ -457,6 +460,66 @@ class ChatCompletionRequest(BaseModel):
         values["tool_choice"] = "auto"
         return values

+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
+
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0

@@ -571,7 +634,7 @@ class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
     input: EmbeddingInput
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     encoding_format: str = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None

@@ -605,7 +668,7 @@ class ScoringRequest(BaseModel):
     )
     apply_softmax: bool = False
     item_first: bool = False
-    model: str
+    model: str = DEFAULT_MODEL_NAME


 class ScoringResponse(BaseModel):
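With the protocol changes above, `model` becomes optional (defaulting to "default"), completion requests gain a `response_format` field, and chat requests can carry a `reasoning` object that the new validators normalize into `reasoning_effort` and `chat_template_kwargs["thinking"]`. The request body below is a sketch of how those fields fit together; the endpoint URL and the served model's support for a thinking mode are assumptions, not part of the diff.

import requests

payload = {
    # "model" may be omitted; it falls back to DEFAULT_MODEL_NAME ("default").
    "messages": [{"role": "user", "content": "Reply with a small JSON object."}],
    # normalize_reasoning_inputs maps this to reasoning_effort="high" and
    # chat_template_kwargs={"thinking": True}.
    "reasoning": {"effort": "high", "enabled": True},
    # set_json_schema wraps a bare "schema" into the json_schema form.
    "response_format": {
        "type": "json_schema",
        "schema": {"title": "Answer", "type": "object"},
    },
}
resp = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
print(resp.json())
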
sglang/srt/entrypoints/openai/serving_chat.py
CHANGED
@@ -148,6 +148,16 @@ class OpenAIServingChat(OpenAIServingBase):
         self, request: ChatCompletionRequest, is_multimodal: bool
     ) -> MessageProcessingResult:
         """Process chat messages and apply chat template"""
+        is_gpt_oss = (
+            hasattr(self.tokenizer_manager.model_config, "hf_config")
+            and hasattr(self.tokenizer_manager.model_config.hf_config, "model_type")
+            and self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        # GptOss model needs to keep special tokens for harmony parsing
+        if is_gpt_oss:
+            request.skip_special_tokens = False
+
         tool_call_constraint = None

         # Apply chat template and its stop strings

@@ -207,6 +217,25 @@ class OpenAIServingChat(OpenAIServingBase):
                     audio_data,
                     modalities,
                 )
+
+                # per the Transformers docs & maintainers, tool call arguments in
+                # assistant-role messages with tool_calls need to be dicts not JSON str -
+                # this is how tool-use chat templates will expect them moving forwards
+                # so, for messages that have tool_calls, parse the string (which we get
+                # from openAI format) to dict
+                if (
+                    processed_msg["role"] == "assistant"
+                    and "tool_calls" in processed_msg
+                    and isinstance(processed_msg["tool_calls"], list)
+                ):
+                    for item in processed_msg["tool_calls"]:
+                        if "arguments" in item["function"] and isinstance(
+                            item["function"]["arguments"], str
+                        ):
+                            item["function"]["arguments"] = json.loads(
+                                item["function"]["arguments"]
+                            )
+
                 openai_compatible_messages.append(processed_msg)

         # Handle assistant prefix for continue_final_message

@@ -806,15 +835,23 @@ class OpenAIServingChat(OpenAIServingBase):
             finish_reason["matched"] = None
         try:
             text, call_info_list = parser.parse_non_stream(text)
-            tool_calls = [
-
-
-
-
-
+            tool_calls = []
+            for call_info in call_info_list:
+                # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
+                if tool_call_parser == "kimi_k2" and call_info.name is not None:
+                    tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
+                else:
+                    tool_id = f"call_{uuid.uuid4().hex[:24]}"
+
+                tool_calls.append(
+                    ToolCall(
+                        id=tool_id,
+                        index=getattr(call_info, "tool_index", None),
+                        function=FunctionResponse(
+                            name=call_info.name, arguments=call_info.parameters
+                        ),
+                    )
                 )
-            for call_info in call_info_list
-            ]
             return tool_calls, text, finish_reason
         except Exception as e:
             logger.error(f"Tool call parsing error: {e}")

@@ -925,7 +962,11 @@ class OpenAIServingChat(OpenAIServingBase):
                 # Tool call ID should be generated only once per tool call
                 if call_item.name:
                     # First chunk: include ID and function name
-
+                    if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
+                        # Align with Kimi-K2 format: functions.{name}:{index}
+                        tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
+                    else:
+                        tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
                     function_name = call_item.name
                 else:
                     # Subsequent chunks: null ID and name for argument deltas
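The non-streaming and streaming paths above now build tool-call IDs the same way: Kimi-K2 gets its native `functions.{name}:{index}` form, while every other parser keeps the OpenAI-style random ID. A standalone sketch of that rule (the helper name is illustrative, not from the diff):

import uuid

def make_tool_call_id(tool_call_parser: str, name: str, tool_index: int) -> str:
    # Kimi-K2 expects ids shaped like functions.{name}:{index}.
    if tool_call_parser == "kimi_k2" and name is not None:
        return f"functions.{name}:{tool_index}"
    # All other parsers keep the OpenAI-style random id.
    return f"call_{uuid.uuid4().hex[:24]}"

print(make_tool_call_id("kimi_k2", "get_weather", 0))  # functions.get_weather:0
print(make_tool_call_id("qwen25", "get_weather", 0))   # call_ followed by 24 hex chars
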
sglang/srt/entrypoints/openai/serving_completions.py
CHANGED
@@ -23,6 +23,7 @@ from sglang.srt.entrypoints.openai.utils import (
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.utils import convert_json_schema_to_str

 logger = logging.getLogger(__name__)


@@ -125,6 +126,20 @@ class OpenAIServingCompletion(OpenAIServingBase):
             "logit_bias": request.logit_bias,
         }

+        # Handle response_format constraints
+        if request.response_format and request.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                request.response_format.json_schema.schema_
+            )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif (
+            request.response_format and request.response_format.type == "structural_tag"
+        ):
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                request.response_format.model_dump(by_alias=True)
+            )
+
         return sampling_params

     async def _handle_streaming_request(
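These hunks let `/v1/completions` honor the same `response_format` constraints as chat: `json_schema` and `json_object` are mapped to a `json_schema` sampling constraint, and `structural_tag` is passed through as a structural tag. A hedged example request (the URL and the schema are assumptions for illustration):

import requests

payload = {
    "prompt": "Describe a book as JSON.",
    "max_tokens": 128,
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "Book",
            "schema": {"type": "object", "properties": {"title": {"type": "string"}}},
        },
    },
}
print(requests.post("http://localhost:30000/v1/completions", json=payload).json())
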
sglang/srt/eplb/eplb_manager.py
CHANGED
@@ -58,9 +58,18 @@ class EPLBManager:
         torch.cuda.synchronize()
         time_start = time.time()

-
+        dump_record_output = get_global_expert_distribution_recorder().dump_record(
             output_mode="object"
-        )
+        )
+        logical_count = dump_record_output["logical_count"]
+        average_utilization_rate_over_window = dump_record_output[
+            "average_utilization_rate_over_window"
+        ]
+
+        # Check whether rebalancing is needed
+        if not self._check_rebalance_needed(average_utilization_rate_over_window):
+            return
+
         expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
             self._server_args, self._model_runner.model_config, logical_count
         )

@@ -81,6 +90,21 @@ class EPLBManager:
         msg += f" time={time_end - time_start:.3f}s"
         logger.info(msg)

+    def _check_rebalance_needed(self, average_utilization_rate_over_window):
+        if average_utilization_rate_over_window is None:
+            return True
+
+        if (
+            average_utilization_rate_over_window
+            > self._server_args.eplb_min_rebalancing_utilization_threshold
+        ):
+            logger.info(
+                f"[EPLBManager] Skipped ep rebalancing: current GPU utilization {average_utilization_rate_over_window:.2f} > minimum rebalance threshold {self._server_args.eplb_min_rebalancing_utilization_threshold:.2f}"
+            )
+            return False
+
+        return True
+
     def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
         all_layer_ids = sorted(
             list(self._model_runner.model.routed_experts_weights_of_layer.keys())
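The new gate skips an EPLB rebalance when the windowed GPU utilization is already above `eplb_min_rebalancing_utilization_threshold`, and falls back to the old always-rebalance behavior when no metric is available. The same decision logic in isolation (function name and values are illustrative):

from typing import Optional

def should_rebalance(avg_utilization: Optional[float], threshold: float) -> bool:
    # No utilization metric recorded -> keep the old behavior and rebalance.
    if avg_utilization is None:
        return True
    # Experts are already busy enough; skip the rebalance, matching the
    # "[EPLBManager] Skipped ep rebalancing" log path above.
    return avg_utilization <= threshold

print(should_rebalance(None, 0.8))   # True
print(should_rebalance(0.95, 0.8))   # False (skip)
print(should_rebalance(0.40, 0.8))   # True
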
sglang/srt/eplb/expert_distribution.py
CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 import logging
+import math
 import os
 import time
 from abc import ABC

@@ -614,8 +615,8 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
         self._enable = self._server_args.enable_expert_distribution_metrics

         if self._enable:
-            window_sizes = [10, 100, 1000]
-            self._history = _DequeCollection(maxlens=window_sizes)
+            self.window_sizes = [10, 100, 1000]
+            self._history = _DequeCollection(maxlens=self.window_sizes)
             self._rank = torch.distributed.get_rank()

     def append(

@@ -787,6 +788,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
         output = dict(
             rank=self._rank,
             logical_count=logical_count_of_buffered_step,
+            average_utilization_rate_over_window=self._get_global_average_utilization_rate(),
         )

         if output_mode == "file":

@@ -797,6 +799,31 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
         else:
             raise NotImplementedError

+    def _get_global_average_utilization_rate(self):
+        if not self._enable or math.isclose(
+            self._server_args.eplb_min_rebalancing_utilization_threshold, 1.0
+        ):
+            return None
+
+        if self._rank == 0:
+            utilization_mean_rates = self._history.mean()
+            window_index = self.window_sizes[-1]
+            average_utilization_rate_over_window = (
+                utilization_mean_rates[window_index]
+                if window_index in utilization_mean_rates
+                else 0
+            )
+
+            avg_rate_tensor = torch.tensor(
+                [average_utilization_rate_over_window],
+                dtype=torch.float32,
+                device="cuda",
+            )
+        else:
+            avg_rate_tensor = torch.empty(1, dtype=torch.float32, device="cuda")
+        torch.distributed.broadcast(avg_rate_tensor, src=0)
+        return avg_rate_tensor.item()
+

 def _dump_to_file(name, data):
     save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
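The new accumulator method computes the long-window mean utilization on rank 0 and broadcasts it so every rank gates rebalancing on the same value. Below is a minimal sketch of that rank-0-computes, all-ranks-receive pattern; it only runs inside an initialized torch.distributed process group with one CUDA device per rank, and the function name is illustrative rather than part of the diff.

import torch
import torch.distributed as dist

def broadcast_scalar_from_rank0(value_on_rank0: float) -> float:
    if dist.get_rank() == 0:
        t = torch.tensor([value_on_rank0], dtype=torch.float32, device="cuda")
    else:
        t = torch.empty(1, dtype=torch.float32, device="cuda")
    dist.broadcast(t, src=0)  # after this call, every rank holds rank 0's value
    return t.item()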