sglang 0.5.1.post2__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/conversation.py +38 -5
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +27 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +16 -7
- sglang/srt/layers/attention/ascend_backend.py +218 -111
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/layer.py +1 -7
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/mxfp4.py +16 -23
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +223 -156
- sglang/srt/managers/detokenizer_manager.py +5 -0
- sglang/srt/managers/io_struct.py +30 -0
- sglang/srt/managers/scheduler.py +58 -7
- sglang/srt/managers/tokenizer_manager.py +36 -3
- sglang/srt/mem_cache/hicache_storage.py +31 -20
- sglang/srt/mem_cache/hiradix_cache.py +12 -3
- sglang/srt/mem_cache/memory_pool.py +73 -14
- sglang/srt/mem_cache/memory_pool_host.py +3 -2
- sglang/srt/mem_cache/radix_cache.py +1 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
- sglang/srt/model_executor/model_runner.py +1 -1
- sglang/srt/models/deepseek_v2.py +12 -3
- sglang/srt/models/gpt_oss.py +2 -1
- sglang/srt/models/qwen2_5_vl.py +1 -0
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/server_args.py +10 -1
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +59 -5
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +4 -3
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +57 -54
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/reasoning_parser.py
CHANGED
@@ -1,13 +1,19 @@
 import re
 from typing import Dict, Optional, Tuple, Type

+from sglang.srt.harmony_parser import HarmonyParser
+

 class StreamingParseResult:
     """Result of streaming incremental parsing."""

-    def __init__(
-        self
-
+    def __init__(
+        self,
+        normal_text: Optional[str] = None,
+        reasoning_text: Optional[str] = None,
+    ):
+        self.normal_text = normal_text or ""
+        self.reasoning_text = reasoning_text or ""


 class BaseReasoningFormatDetector:
@@ -188,316 +194,60 @@ class KimiDetector(BaseReasoningFormatDetector):

 class GptOssDetector(BaseReasoningFormatDetector):
     """
-    Detector for T4-style reasoning format.
-
-    Assumes reasoning format with two channels:
-    <|channel|>analysis<|message|>...reasoning content...<|end|>
-    <|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
-
-    Returns content from 'analysis' channel as reasoning_text
-    and content from 'final' channel as normal_text.
-
-    Args:
-        stream_reasoning (bool): If False, accumulates reasoning content until complete.
-            If True, streams reasoning content as it arrives.
+    Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
     """

     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
-        # TypeScript uses channel tokens instead of simple start/end tokens
         super().__init__(
             "<|channel|>analysis<|message|>",
             "<|end|>",
-            force_reasoning=
+            force_reasoning=force_reasoning,
             stream_reasoning=stream_reasoning,
         )
-        self.
-        self.final_channel_end = "<|return|>"
-        self._in_final_channel = False
-        self._analysis_complete = False
-        self._in_reasoning = True
+        self.parser = HarmonyParser()

     def detect_and_parse(self, text: str) -> StreamingParseResult:
-
-
-
+        events = self.parser.parse(text)
+        # Flush the buffer for one-shot parsing
+        events += self.parser.parse("")

-
-
-        """
-        # HACK: Handle simplified format (analysis...assistantfinal) without channel markers
-        if (
-            text.startswith("analysis")
-            and "assistantfinal" in text
-            and "<|channel|>" not in text
-        ):
-            # Split on "assistantfinal"
-            parts = text.split("assistantfinal", 1)
-            self._in_reasoning = False
-            if len(parts) == 2:
-                reasoning_text = parts[0][
-                    len("analysis") :
-                ].strip()  # Remove "analysis" prefix
-                normal_text = parts[1].strip()
-                return StreamingParseResult(
-                    normal_text=normal_text, reasoning_text=reasoning_text
-                )
-
-        reasoning_parts = []
-        normal_parts = []
-        current_pos = 0
-
-        # Process text sequentially to preserve tool calls between analysis sections
-        while current_pos < len(text):
-            # Look for next analysis channel
-            analysis_start_idx = text.find(self.think_start_token, current_pos)
-
-            if analysis_start_idx == -1:
-                # No more analysis channels, rest goes to remaining
-                break
-
-            # Preserve any content before this analysis channel (could include tool calls)
-            if analysis_start_idx > current_pos:
-                between_content = text[current_pos:analysis_start_idx]
-                # This content will be added to normal_parts later
-                normal_parts.append(between_content)
-
-            # Extract analysis content
-            analysis_content_start = analysis_start_idx + len(self.think_start_token)
-            analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
-
-            if analysis_end_idx != -1:
-                reasoning_parts.append(
-                    text[analysis_content_start:analysis_end_idx].strip()
-                )
-                current_pos = analysis_end_idx + len(self.think_end_token)
-            else:
-                # Analysis not complete
-                reasoning_parts.append(text[analysis_content_start:].strip())
-                reasoning_text = "".join(reasoning_parts)
-                return StreamingParseResult(reasoning_text=reasoning_text)
-
-        # Add any remaining text after all analysis sections
-        if current_pos < len(text):
-            remaining = text[current_pos:]
-            normal_parts.append(remaining)
-
-        # Process non-analysis content for commentary sections
-        full_normal_text = "".join(normal_parts)
-
-        # Extract reasoning from non-tool-call commentary sections
-        # Tool calls have "to=" in their header, regular commentary does not
-        commentary_pattern = re.compile(
-            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
-            re.DOTALL,
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
         )
-
-
-
-
-
-
-
-
-
-            commentary_idx = match_text.find("<|channel|>commentary")
-            if commentary_idx != -1:
-                message_idx = match_text.find("<|message|>", commentary_idx)
-                if message_idx != -1:
-                    between_text = match_text[commentary_idx:message_idx]
-                    # If no "to=" found, this is regular commentary (reasoning content)
-                    if " to=" not in between_text:
-                        content = match.group(1).strip()
-                        reasoning_parts.append(content)
-                        # Remove this commentary section from normal text
-                        cleaned_text = (
-                            cleaned_text[: match.start()] + cleaned_text[match.end() :]
-                        )
-
-        full_normal_text = cleaned_text
-
-        # Combine all reasoning parts
-        reasoning_text = "".join(reasoning_parts)
-
-        # Process full_normal_text for final output
-        normal_text = ""
-        if self.final_channel_start in full_normal_text:
-            final_start = full_normal_text.find(self.final_channel_start)
-            final_content_start = final_start + len(self.final_channel_start)
-            final_end = full_normal_text.find(
-                self.final_channel_end, final_content_start
-            )
-
-            if final_end != -1:
-                # Extract content before final channel (includes tool calls)
-                before_final = full_normal_text[:final_start].strip()
-                # Extract ONLY the final channel content (not the channel markers)
-                final_text = full_normal_text[final_content_start:final_end].strip()
-                # Extract content after final channel
-                after_final = full_normal_text[
-                    final_end + len(self.final_channel_end) :
-                ].strip()
-
-                # For tool calls + final answer: concatenate tool calls with final text
-                parts = []
-                if before_final:
-                    parts.append(before_final)
-                if final_text:
-                    parts.append(final_text)
-                if after_final:
-                    parts.append(after_final)
-                normal_text = " ".join(parts)
-            else:
-                # Final channel not complete - extract what we have
-                # Look for just <|channel|>final<|message|> without <|return|>
-                alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
-                if alt_final_start != -1:
-                    before_alt_final = full_normal_text[:alt_final_start].strip()
-                    alt_final_content = full_normal_text[
-                        alt_final_start + len("<|channel|>final<|message|>") :
-                    ].strip()
-
-                    parts = []
-                    if before_alt_final:
-                        parts.append(before_alt_final)
-                    if alt_final_content:
-                        parts.append(alt_final_content)
-                    normal_text = " ".join(parts)
-                else:
-                    normal_text = full_normal_text.strip()
-        else:
-            # No final channel, treat all as normal text (includes tool calls)
-            normal_text = full_normal_text.strip()
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+        # Tool call events preserve raw text with structural markers

         return StreamingParseResult(
-            normal_text=normal_text,
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
         )

     def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
-
-        Streaming incremental parsing for GPT-OSS format.
+        events = self.parser.parse(new_text)

-
-
-        TODO: Implement proper incremental parsing for better streaming performance.
-        """
-        self._buffer += new_text
-
-        if not self._in_reasoning:
-            return StreamingParseResult(normal_text=new_text)
-
-        # Check if we have complete sections to process
-        # For GPT-OSS, we need to wait for complete channel sections
-        # HACK: For now, use simplified approach - wait for key markers before processing
-        key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
-        has_complete_section = any(marker in self._buffer for marker in key_markers)
-
-        if not has_complete_section:
-            # Still accumulating, don't process yet
-            return StreamingParseResult()
-
-        # Handle simplified format (analysis...assistantfinal) with true incremental streaming
-        if (
-            "<|channel|>" not in self._buffer
-        ):  # Simplified format without channel markers
-            if self._buffer.startswith("analysis"):
-                # Check if we have the transition to assistantfinal
-                if "assistantfinal" in self._buffer:
-                    self._in_reasoning = False
-                    # Complete reasoning section - extract and stream it
-                    parts = self._buffer.split("assistantfinal", 1)
-                    reasoning_text = parts[0][len("analysis") :].strip()
-                    final_content = parts[1].strip()
-
-                    # Clear buffer and return both reasoning and final content
-                    self._buffer = ""
-                    return StreamingParseResult(
-                        reasoning_text=reasoning_text if self.stream_reasoning else "",
-                        normal_text=final_content,
-                    )
-                elif self.stream_reasoning:
-                    # Stream reasoning content incrementally as it arrives
-                    current_reasoning = self._buffer[len("analysis") :].strip()
-                    self._buffer = ""
-                    return StreamingParseResult(reasoning_text=current_reasoning)
-                else:
-                    # Wait for assistantfinal
-                    return StreamingParseResult()
-            elif self._buffer.startswith("assistantfinal"):
-                # Direct final content without analysis
-                final_content = self._buffer[len("assistantfinal") :].strip()
-                self._buffer = ""
-                return StreamingParseResult(normal_text=final_content)
-
-        # For full channel format, process sections as they complete
-        result = StreamingParseResult()
-
-        # Process complete analysis sections
-        while (
-            self.think_start_token in self._buffer
-            and self.think_end_token in self._buffer
-        ):
-            start_idx = self._buffer.find(self.think_start_token)
-            start_pos = start_idx + len(self.think_start_token)
-            end_pos = self._buffer.find(self.think_end_token, start_pos)
-
-            if end_pos != -1:
-                reasoning_content = self._buffer[start_pos:end_pos].strip()
-                if self.stream_reasoning and reasoning_content:
-                    result.reasoning_text += reasoning_content
-
-                # Remove processed analysis section
-                self._buffer = (
-                    self._buffer[:start_idx]
-                    + self._buffer[end_pos + len(self.think_end_token) :]
-                )
-            else:
-                break
-
-        # Process complete commentary sections
-        commentary_pattern = re.compile(
-            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
-            re.DOTALL,
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
         )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)

-
-
-
-
-            if self.stream_reasoning and commentary_content:
-                result.reasoning_text += commentary_content
-
-            # Remove this commentary section
-            self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
-            # Clean up any standalone <|start|>assistant
-            self._buffer = re.sub(
-                r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
-            )
-
-        # Handle final channel completion
-        if self.final_channel_start in self._buffer:
-            final_start = self._buffer.find(self.final_channel_start)
-            final_content_start = final_start + len(self.final_channel_start)
-
-            # Check if final channel is complete
-            final_end = self._buffer.find(self.final_channel_end, final_content_start)
-            if final_end != -1:
-                # Complete final channel - process everything
-                final_result = self.detect_and_parse(self._buffer)
-                self._buffer = ""
-                return StreamingParseResult(
-                    normal_text=final_result.normal_text,
-                    reasoning_text=result.reasoning_text + final_result.reasoning_text,
-                )
-            else:
-                # Extract content before final channel (e.g. tool calls)
-                before_final = self._buffer[:final_start]
-                if before_final:
-                    # Output tool calls for processing
-                    result.normal_text += before_final
-                # Keep the final channel part in buffer
-                self._buffer = self._buffer[final_start:]
-
-        return result
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )


 class ReasoningParser:
@@ -526,7 +276,7 @@ class ReasoningParser:
         self,
         model_type: Optional[str] = None,
         stream_reasoning: bool = True,
-        force_reasoning: bool =
+        force_reasoning: Optional[bool] = None,
     ):
         if not model_type:
             raise ValueError("Model type must be specified")
@@ -535,19 +285,25 @@ class ReasoningParser:
         if not detector_class:
             raise ValueError(f"Unsupported model type: {model_type}")

-
+        # Special cases where we override force_reasoning
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
             force_reasoning = True

-
-
-
+        # Only pass force_reasoning if explicitly set, let detectors use their defaults
+        kwargs = {"stream_reasoning": stream_reasoning}
+        if force_reasoning is not None:
+            kwargs["force_reasoning"] = force_reasoning
+
+        self.detector = detector_class(**kwargs)

-    def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
         """Non-streaming call: one-time parsing"""
         ret = self.detector.detect_and_parse(full_text)
         return ret.reasoning_text, ret.normal_text

-    def parse_stream_chunk(
+    def parse_stream_chunk(
+        self, chunk_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
         """Streaming call: incremental parsing"""
         ret = self.detector.parse_streaming_increment(chunk_text)
         return ret.reasoning_text, ret.normal_text
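
Net effect: both `detect_and_parse` and `parse_streaming_increment` are now thin wrappers that map `HarmonyParser` events onto `StreamingParseResult`. A minimal sketch of driving the new path (assumes sglang 0.5.1.post3 is importable; exact whitespace handling is up to `HarmonyParser`, so the comments describe expected content rather than exact strings):

```python
from sglang.srt.reasoning_parser import ReasoningParser

# gpt-oss is one of the model types for which force_reasoning is overridden to True.
parser = ReasoningParser(model_type="gpt-oss", stream_reasoning=False)

text = (
    "<|channel|>analysis<|message|>Compare both routes first.<|end|>"
    "<|start|>assistant<|channel|>final<|message|>Take route B.<|return|>"
)
# parse_non_stream returns (reasoning_text, normal_text), in that order.
reasoning, normal = parser.parse_non_stream(text)
# reasoning -> analysis-channel content ("Compare both routes first.")
# normal    -> final-channel content ("Take route B.")
```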
sglang/srt/server_args.py
CHANGED
@@ -25,7 +25,6 @@ from typing import List, Literal, Optional, Union

 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -39,6 +38,8 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
@@ -216,6 +217,7 @@ class ServerArgs:
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -1641,6 +1643,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -2271,6 +2279,7 @@ class ServerArgs:
         if is_mxfp4_quant_format:
             # use bf16 for mxfp4 triton kernels
             self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
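
The new `--hicache-storage-backend-extra-config` flag carries a JSON object encoded as a string, which the storage backend is expected to decode. A small sketch of that round-trip; the key names below are illustrative, only the flag itself comes from the diff:

```python
import json

# Value as it would be passed on the command line, e.g.
#   --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051"}'
raw = '{"master_server_address": "127.0.0.1:50051", "prefetch_threads": 4}'

config = json.loads(raw)  # backends decode the string into a plain dict
assert config["prefetch_threads"] == 4
```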
sglang/srt/tokenizer/tiktoken_tokenizer.py
CHANGED
@@ -121,7 +121,12 @@ class TiktokenTokenizer:
         return self.tokenizer.decode_batch(batch)

     def apply_chat_template(
-        self,
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
     ):
         ret = self.chat_template_jinja.render(
             messages=messages, add_generation_prompt=add_generation_prompt
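
The widened `apply_chat_template` signature is easiest to see with a stand-in class, since a real `TiktokenTokenizer` needs a tokenizer file to construct. Note that the render call in the hunk above still forwards only `messages` and `add_generation_prompt`, so `tools` and `reasoning_effort` only take effect where the chat template consumes them:

```python
# Stand-in mirroring the new parameter list; the real method renders
# self.chat_template_jinja as shown in the hunk above.
class _DemoTokenizer:
    def apply_chat_template(
        self, messages, tokenize, add_generation_prompt, tools=None, reasoning_effort=None
    ):
        return f"{len(messages)} message(s), effort={reasoning_effort}"


tok = _DemoTokenizer()
print(
    tok.apply_chat_template(
        [{"role": "user", "content": "Hello"}],
        tokenize=False,
        add_generation_prompt=True,
        reasoning_effort="low",
    )
)
```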
sglang/srt/utils.py
CHANGED
@@ -172,6 +172,20 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10


+@lru_cache(maxsize=1)
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
 _warned_bool_env_var_keys = set()

@@ -1665,9 +1679,29 @@ def direct_register_custom_op(
     IMPORTANT: the lifetime of the operator is tied to the lifetime of the
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
+
+    Note: This function will silently skip registration if the operator
+    with the same name is already registered to avoid RuntimeError in
+    multi-engine scenarios (e.g., VERL framework).
     """
     import torch.library

+    my_lib = target_lib or sglang_lib
+
+    # Check if operator is already registered to avoid duplicate registration
+    # This is important for scenarios where multiple SGLang engines run in the same process
+    try:
+        # Try to access the operator to see if it's already registered
+        lib_name = my_lib.m.name if hasattr(my_lib.m, "name") else "sglang"
+        if hasattr(torch.ops, lib_name) and hasattr(
+            getattr(torch.ops, lib_name), op_name
+        ):
+            # Operator already exists, skip registration
+            return
+    except (AttributeError, RuntimeError):
+        # Operator doesn't exist, proceed with registration
+        pass
+
     if hasattr(torch.library, "infer_schema"):
         schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
     else:
@@ -1676,11 +1710,22 @@ def direct_register_custom_op(

     schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)

-
-
-
-
-
+    try:
+        my_lib.define(op_name + schema_str)
+        my_lib.impl(op_name, op_func, "CUDA")
+        if fake_impl is not None:
+            my_lib._register_fake(op_name, fake_impl)
+    except RuntimeError as error:
+        if "Tried to register an operator" in str(error) and "multiple times" in str(error):
+            # Silently ignore duplicate registration errors
+            # This can happen in multi-engine scenarios
+            pass
+        else:
+            # Re-raise other RuntimeErrors
+            raise error
+    except AttributeError as error:
+        # Always re-raise AttributeError as it indicates missing dependencies
+        raise error


 def set_gpu_proc_affinity(
@@ -1919,6 +1964,15 @@ def get_ip() -> str:
     except Exception:
         pass

+    # try using hostname
+    hostname = socket.gethostname()
+    try:
+        ip_addr = socket.gethostbyname(hostname)
+        warnings.warn("using local ip address: {}".format(ip_addr))
+        return ip_addr
+    except Exception:
+        pass
+
     warnings.warn(
         "Failed to get the IP address, using 0.0.0.0 by default."
         "The value can be set by the environment variable"
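
The practical effect of the registration guard: calling `direct_register_custom_op` twice with the same op name becomes a no-op on the second call instead of raising, and the relocated `is_sm90_supported`/`is_sm100_supported` helpers now gate on compute capability plus CUDA version from here. A sketch, assuming an installed sglang with a CUDA build of torch; the op name and functions below are hypothetical:

```python
import torch

from sglang.srt.utils import direct_register_custom_op


def _double(x: torch.Tensor) -> torch.Tensor:
    return x * 2


def _double_fake(x: torch.Tensor) -> torch.Tensor:
    return torch.empty_like(x)


for _ in range(2):  # the second pass hits the "already registered" early return
    direct_register_custom_op(
        op_name="demo_double",  # hypothetical op name
        op_func=_double,
        mutates_args=[],
        fake_impl=_double_fake,
    )
```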
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post2"
+__version__ = "0.5.1.post3"
{sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post2
+Version: 0.5.1.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -257,7 +257,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.7; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
@@ -304,6 +304,7 @@ Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Requires-Dist: pytest; extra == "test"
+Requires-Dist: tabulate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -374,7 +375,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).