sglang 0.5.1.post2__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/bench_one_batch_server.py +79 -53
  2. sglang/bench_serving.py +186 -14
  3. sglang/profiler.py +0 -1
  4. sglang/srt/conversation.py +38 -5
  5. sglang/srt/entrypoints/engine.py +1 -1
  6. sglang/srt/entrypoints/openai/protocol.py +27 -24
  7. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  8. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  9. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  10. sglang/srt/function_call/function_call_parser.py +2 -0
  11. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  12. sglang/srt/harmony_parser.py +588 -0
  13. sglang/srt/hf_transformers_utils.py +16 -7
  14. sglang/srt/layers/attention/ascend_backend.py +218 -111
  15. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  16. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  18. sglang/srt/layers/communicator.py +1 -2
  19. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  20. sglang/srt/layers/moe/ep_moe/layer.py +1 -7
  21. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  22. sglang/srt/layers/moe/topk.py +1 -1
  23. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  24. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
  25. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  26. sglang/srt/layers/quantization/fp8.py +2 -1
  27. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  28. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  29. sglang/srt/layers/quantization/mxfp4.py +16 -23
  30. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  31. sglang/srt/layers/utils.py +0 -14
  32. sglang/srt/managers/cache_controller.py +223 -156
  33. sglang/srt/managers/detokenizer_manager.py +5 -0
  34. sglang/srt/managers/io_struct.py +30 -0
  35. sglang/srt/managers/scheduler.py +58 -7
  36. sglang/srt/managers/tokenizer_manager.py +36 -3
  37. sglang/srt/mem_cache/hicache_storage.py +31 -20
  38. sglang/srt/mem_cache/hiradix_cache.py +12 -3
  39. sglang/srt/mem_cache/memory_pool.py +73 -14
  40. sglang/srt/mem_cache/memory_pool_host.py +3 -2
  41. sglang/srt/mem_cache/radix_cache.py +1 -0
  42. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
  43. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
  44. sglang/srt/model_executor/model_runner.py +1 -1
  45. sglang/srt/models/deepseek_v2.py +12 -3
  46. sglang/srt/models/gpt_oss.py +2 -1
  47. sglang/srt/models/qwen2_5_vl.py +1 -0
  48. sglang/srt/reasoning_parser.py +56 -300
  49. sglang/srt/server_args.py +10 -1
  50. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  51. sglang/srt/utils.py +59 -5
  52. sglang/version.py +1 -1
  53. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +4 -3
  54. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +57 -54
  55. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
  56. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
  57. {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/reasoning_parser.py CHANGED
@@ -1,13 +1,19 @@
 import re
 from typing import Dict, Optional, Tuple, Type
 
+from sglang.srt.harmony_parser import HarmonyParser
+
 
 class StreamingParseResult:
     """Result of streaming incremental parsing."""
 
-    def __init__(self, normal_text: str = "", reasoning_text: str = ""):
-        self.normal_text = normal_text
-        self.reasoning_text = reasoning_text
+    def __init__(
+        self,
+        normal_text: Optional[str] = None,
+        reasoning_text: Optional[str] = None,
+    ):
+        self.normal_text = normal_text or ""
+        self.reasoning_text = reasoning_text or ""
 
 
 class BaseReasoningFormatDetector:
@@ -188,316 +194,60 @@ class KimiDetector(BaseReasoningFormatDetector):
 
 class GptOssDetector(BaseReasoningFormatDetector):
     """
-    Detector for T4-style reasoning format.
-
-    Assumes reasoning format with two channels:
-    <|channel|>analysis<|message|>...reasoning content...<|end|>
-    <|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
-
-    Returns content from 'analysis' channel as reasoning_text
-    and content from 'final' channel as normal_text.
-
-    Args:
-        stream_reasoning (bool): If False, accumulates reasoning content until complete.
-            If True, streams reasoning content as it arrives.
+    Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
     """
 
     def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
-        # TypeScript uses channel tokens instead of simple start/end tokens
         super().__init__(
             "<|channel|>analysis<|message|>",
             "<|end|>",
-            force_reasoning=True,
+            force_reasoning=force_reasoning,
             stream_reasoning=stream_reasoning,
         )
-        self.final_channel_start = "<|start|>assistant<|channel|>final<|message|>"
-        self.final_channel_end = "<|return|>"
-        self._in_final_channel = False
-        self._analysis_complete = False
-        self._in_reasoning = True
+        self.parser = HarmonyParser()
 
     def detect_and_parse(self, text: str) -> StreamingParseResult:
-        """
-        One-time parsing: Detects and parses both analysis and final channels.
-        Tool call channels are preserved in normal_text for downstream processing.
+        events = self.parser.parse(text)
+        # Flush the buffer for one-shot parsing
+        events += self.parser.parse("")
 
-        HACK: Also handles simplified format where text starts with "analysis" and transitions
-        to "assistantfinal" without full channel markers.
-        """
-        # HACK: Handle simplified format (analysis...assistantfinal) without channel markers
-        if (
-            text.startswith("analysis")
-            and "assistantfinal" in text
-            and "<|channel|>" not in text
-        ):
-            # Split on "assistantfinal"
-            parts = text.split("assistantfinal", 1)
-            self._in_reasoning = False
-            if len(parts) == 2:
-                reasoning_text = parts[0][
-                    len("analysis") :
-                ].strip()  # Remove "analysis" prefix
-                normal_text = parts[1].strip()
-                return StreamingParseResult(
-                    normal_text=normal_text, reasoning_text=reasoning_text
-                )
-
-        reasoning_parts = []
-        normal_parts = []
-        current_pos = 0
-
-        # Process text sequentially to preserve tool calls between analysis sections
-        while current_pos < len(text):
-            # Look for next analysis channel
-            analysis_start_idx = text.find(self.think_start_token, current_pos)
-
-            if analysis_start_idx == -1:
-                # No more analysis channels, rest goes to remaining
-                break
-
-            # Preserve any content before this analysis channel (could include tool calls)
-            if analysis_start_idx > current_pos:
-                between_content = text[current_pos:analysis_start_idx]
-                # This content will be added to normal_parts later
-                normal_parts.append(between_content)
-
-            # Extract analysis content
-            analysis_content_start = analysis_start_idx + len(self.think_start_token)
-            analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
-
-            if analysis_end_idx != -1:
-                reasoning_parts.append(
-                    text[analysis_content_start:analysis_end_idx].strip()
-                )
-                current_pos = analysis_end_idx + len(self.think_end_token)
-            else:
-                # Analysis not complete
-                reasoning_parts.append(text[analysis_content_start:].strip())
-                reasoning_text = "".join(reasoning_parts)
-                return StreamingParseResult(reasoning_text=reasoning_text)
-
-        # Add any remaining text after all analysis sections
-        if current_pos < len(text):
-            remaining = text[current_pos:]
-            normal_parts.append(remaining)
-
-        # Process non-analysis content for commentary sections
-        full_normal_text = "".join(normal_parts)
-
-        # Extract reasoning from non-tool-call commentary sections
-        # Tool calls have "to=" in their header, regular commentary does not
-        commentary_pattern = re.compile(
-            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
-            re.DOTALL,
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
         )
-
-        cleaned_text = full_normal_text
-        for match in reversed(list(commentary_pattern.finditer(full_normal_text))):
-            # Check if this commentary is a tool call by looking at the text before <|message|>
-            match_start = match.start()
-            # Find where "<|channel|>commentary" starts within the matched pattern
-            # The pattern starts with "<|start|>assistant<|channel|>commentary"
-            # So we look for the text between "commentary" and "<|message|>" in the match
-            match_text = full_normal_text[match_start : match.end()]
-            commentary_idx = match_text.find("<|channel|>commentary")
-            if commentary_idx != -1:
-                message_idx = match_text.find("<|message|>", commentary_idx)
-                if message_idx != -1:
-                    between_text = match_text[commentary_idx:message_idx]
-                    # If no "to=" found, this is regular commentary (reasoning content)
-                    if " to=" not in between_text:
-                        content = match.group(1).strip()
-                        reasoning_parts.append(content)
-                        # Remove this commentary section from normal text
-                        cleaned_text = (
-                            cleaned_text[: match.start()] + cleaned_text[match.end() :]
-                        )
-
-        full_normal_text = cleaned_text
-
-        # Combine all reasoning parts
-        reasoning_text = "".join(reasoning_parts)
-
-        # Process full_normal_text for final output
-        normal_text = ""
-        if self.final_channel_start in full_normal_text:
-            final_start = full_normal_text.find(self.final_channel_start)
-            final_content_start = final_start + len(self.final_channel_start)
-            final_end = full_normal_text.find(
-                self.final_channel_end, final_content_start
-            )
-
-            if final_end != -1:
-                # Extract content before final channel (includes tool calls)
-                before_final = full_normal_text[:final_start].strip()
-                # Extract ONLY the final channel content (not the channel markers)
-                final_text = full_normal_text[final_content_start:final_end].strip()
-                # Extract content after final channel
-                after_final = full_normal_text[
-                    final_end + len(self.final_channel_end) :
-                ].strip()
-
-                # For tool calls + final answer: concatenate tool calls with final text
-                parts = []
-                if before_final:
-                    parts.append(before_final)
-                if final_text:
-                    parts.append(final_text)
-                if after_final:
-                    parts.append(after_final)
-                normal_text = " ".join(parts)
-            else:
-                # Final channel not complete - extract what we have
-                # Look for just <|channel|>final<|message|> without <|return|>
-                alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
-                if alt_final_start != -1:
-                    before_alt_final = full_normal_text[:alt_final_start].strip()
-                    alt_final_content = full_normal_text[
-                        alt_final_start + len("<|channel|>final<|message|>") :
-                    ].strip()
-
-                    parts = []
-                    if before_alt_final:
-                        parts.append(before_alt_final)
-                    if alt_final_content:
-                        parts.append(alt_final_content)
-                    normal_text = " ".join(parts)
-                else:
-                    normal_text = full_normal_text.strip()
-        else:
-            # No final channel, treat all as normal text (includes tool calls)
-            normal_text = full_normal_text.strip()
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+        # Tool call events preserve raw text with structural markers
 
         return StreamingParseResult(
-            normal_text=normal_text, reasoning_text=reasoning_text
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
         )
 
     def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
-        """
-        Streaming incremental parsing for GPT-OSS format.
+        events = self.parser.parse(new_text)
 
-        This is a simplified streaming implementation that accumulates content
-        and delegates to the non-streaming parser for complex multi-channel parsing.
-        TODO: Implement proper incremental parsing for better streaming performance.
-        """
-        self._buffer += new_text
-
-        if not self._in_reasoning:
-            return StreamingParseResult(normal_text=new_text)
-
-        # Check if we have complete sections to process
-        # For GPT-OSS, we need to wait for complete channel sections
-        # HACK: For now, use simplified approach - wait for key markers before processing
-        key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
-        has_complete_section = any(marker in self._buffer for marker in key_markers)
-
-        if not has_complete_section:
-            # Still accumulating, don't process yet
-            return StreamingParseResult()
-
-        # Handle simplified format (analysis...assistantfinal) with true incremental streaming
-        if (
-            "<|channel|>" not in self._buffer
-        ):  # Simplified format without channel markers
-            if self._buffer.startswith("analysis"):
-                # Check if we have the transition to assistantfinal
-                if "assistantfinal" in self._buffer:
-                    self._in_reasoning = False
-                    # Complete reasoning section - extract and stream it
-                    parts = self._buffer.split("assistantfinal", 1)
-                    reasoning_text = parts[0][len("analysis") :].strip()
-                    final_content = parts[1].strip()
-
-                    # Clear buffer and return both reasoning and final content
-                    self._buffer = ""
-                    return StreamingParseResult(
-                        reasoning_text=reasoning_text if self.stream_reasoning else "",
-                        normal_text=final_content,
-                    )
-                elif self.stream_reasoning:
-                    # Stream reasoning content incrementally as it arrives
-                    current_reasoning = self._buffer[len("analysis") :].strip()
-                    self._buffer = ""
-                    return StreamingParseResult(reasoning_text=current_reasoning)
-                else:
-                    # Wait for assistantfinal
-                    return StreamingParseResult()
-            elif self._buffer.startswith("assistantfinal"):
-                # Direct final content without analysis
-                final_content = self._buffer[len("assistantfinal") :].strip()
-                self._buffer = ""
-                return StreamingParseResult(normal_text=final_content)
-
-        # For full channel format, process sections as they complete
-        result = StreamingParseResult()
-
-        # Process complete analysis sections
-        while (
-            self.think_start_token in self._buffer
-            and self.think_end_token in self._buffer
-        ):
-            start_idx = self._buffer.find(self.think_start_token)
-            start_pos = start_idx + len(self.think_start_token)
-            end_pos = self._buffer.find(self.think_end_token, start_pos)
-
-            if end_pos != -1:
-                reasoning_content = self._buffer[start_pos:end_pos].strip()
-                if self.stream_reasoning and reasoning_content:
-                    result.reasoning_text += reasoning_content
-
-                # Remove processed analysis section
-                self._buffer = (
-                    self._buffer[:start_idx]
-                    + self._buffer[end_pos + len(self.think_end_token) :]
-                )
-            else:
-                break
-
-        # Process complete commentary sections
-        commentary_pattern = re.compile(
-            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
-            re.DOTALL,
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
         )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
 
-        for match in reversed(list(commentary_pattern.finditer(self._buffer))):
-            # Check if this is a tool call
-            start_pos = match.start()
-            commentary_content = match.group(1).strip()
-            if self.stream_reasoning and commentary_content:
-                result.reasoning_text += commentary_content
-
-            # Remove this commentary section
-            self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
-        # Clean up any standalone <|start|>assistant
-        self._buffer = re.sub(
-            r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
-        )
-
-        # Handle final channel completion
-        if self.final_channel_start in self._buffer:
-            final_start = self._buffer.find(self.final_channel_start)
-            final_content_start = final_start + len(self.final_channel_start)
-
-            # Check if final channel is complete
-            final_end = self._buffer.find(self.final_channel_end, final_content_start)
-            if final_end != -1:
-                # Complete final channel - process everything
-                final_result = self.detect_and_parse(self._buffer)
-                self._buffer = ""
-                return StreamingParseResult(
-                    normal_text=final_result.normal_text,
-                    reasoning_text=result.reasoning_text + final_result.reasoning_text,
-                )
-            else:
-                # Extract content before final channel (e.g. tool calls)
-                before_final = self._buffer[:final_start]
-                if before_final:
-                    # Output tool calls for processing
-                    result.normal_text += before_final
-                # Keep the final channel part in buffer
-                self._buffer = self._buffer[final_start:]
-
-        return result
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
 
 
 class ReasoningParser:
@@ -526,7 +276,7 @@ class ReasoningParser:
         self,
         model_type: Optional[str] = None,
         stream_reasoning: bool = True,
-        force_reasoning: bool = False,
+        force_reasoning: Optional[bool] = None,
     ):
         if not model_type:
             raise ValueError("Model type must be specified")
@@ -535,19 +285,25 @@
         if not detector_class:
            raise ValueError(f"Unsupported model type: {model_type}")
 
-        if model_type.lower() == "qwen3-thinking":
+        # Special cases where we override force_reasoning
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
            force_reasoning = True
 
-        self.detector = detector_class(
-            stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
-        )
+        # Only pass force_reasoning if explicitly set, let detectors use their defaults
+        kwargs = {"stream_reasoning": stream_reasoning}
+        if force_reasoning is not None:
+            kwargs["force_reasoning"] = force_reasoning
+
+        self.detector = detector_class(**kwargs)
 
-    def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
        """Non-streaming call: one-time parsing"""
        ret = self.detector.detect_and_parse(full_text)
        return ret.reasoning_text, ret.normal_text
 
-    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, str]:
+    def parse_stream_chunk(
+        self, chunk_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
        """Streaming call: incremental parsing"""
        ret = self.detector.parse_streaming_increment(chunk_text)
        return ret.reasoning_text, ret.normal_text
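Note: the rewrite above collapses both the one-shot and streaming paths into the same aggregation step over HarmonyParser events. A minimal sketch of that step, assuming an event object with `event_type`, `content`, and `raw_text` attributes as implied by the diff (the `Event` class below is an illustrative stand-in, not the actual `sglang.srt.harmony_parser` API):

```python
from typing import List, Optional, Tuple


class Event:
    """Illustrative stand-in for a HarmonyParser event (assumed shape)."""

    def __init__(self, event_type: str, content: str, raw_text: Optional[str] = None):
        self.event_type = event_type  # "reasoning", "normal", or "tool_call"
        self.content = content
        self.raw_text = raw_text


def aggregate_events(events: List[Event]) -> Tuple[str, str]:
    """Collapse parser events into (normal_text, reasoning_text)."""
    reasoning_text = "".join(e.content for e in events if e.event_type == "reasoning")
    normal_parts = []
    for e in events:
        if e.event_type == "normal":
            normal_parts.append(e.content)
        elif e.event_type == "tool_call":
            # Keep raw text so <|channel|>/<|message|> markers survive for the
            # downstream function-call detector.
            normal_parts.append(e.raw_text if e.raw_text else e.content)
    return "".join(normal_parts), reasoning_text


# Example: a reasoning event followed by a normal event.
events = [Event("reasoning", "thinking..."), Event("normal", "Hello!")]
assert aggregate_events(events) == ("Hello!", "thinking...")
```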
sglang/srt/server_args.py CHANGED
@@ -25,7 +25,6 @@ from typing import List, Literal, Optional, Union
 
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -39,6 +38,8 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
@@ -216,6 +217,7 @@ class ServerArgs:
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -1641,6 +1643,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )
 
         # Double Sparsity
         parser.add_argument(
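The new `--hicache-storage-backend-extra-config` flag carries backend-specific settings as a single JSON string. A small sketch of how such a value could be supplied and decoded (the keys shown are invented for illustration; consult the chosen storage backend for its real options):

```python
import json

# Hypothetical CLI usage:
#   --hicache-storage-backend-extra-config '{"endpoint": "10.0.0.5:9000", "timeout_s": 5}'
raw = '{"endpoint": "10.0.0.5:9000", "timeout_s": 5}'  # illustrative keys only

extra_config = json.loads(raw) if raw else {}
assert extra_config["timeout_s"] == 5
```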
@@ -2271,6 +2279,7 @@
         if is_mxfp4_quant_format:
             # use bf16 for mxfp4 triton kernels
             self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
sglang/srt/tokenizer/tiktoken_tokenizer.py CHANGED
@@ -121,7 +121,12 @@ class TiktokenTokenizer:
         return self.tokenizer.decode_batch(batch)
 
     def apply_chat_template(
-        self, messages, tokenize, add_generation_prompt, tools=None
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
     ):
         ret = self.chat_template_jinja.render(
             messages=messages, add_generation_prompt=add_generation_prompt
sglang/srt/utils.py CHANGED
@@ -172,6 +172,20 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10
 
 
+@lru_cache(maxsize=1)
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
 _warned_bool_env_var_keys = set()
 
 
@@ -1665,9 +1679,29 @@ def direct_register_custom_op(
     IMPORTANT: the lifetime of the operator is tied to the lifetime of the
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
+
+    Note: This function will silently skip registration if the operator
+    with the same name is already registered to avoid RuntimeError in
+    multi-engine scenarios (e.g., VERL framework).
     """
     import torch.library
 
+    my_lib = target_lib or sglang_lib
+
+    # Check if operator is already registered to avoid duplicate registration
+    # This is important for scenarios where multiple SGLang engines run in the same process
+    try:
+        # Try to access the operator to see if it's already registered
+        lib_name = my_lib.m.name if hasattr(my_lib.m, "name") else "sglang"
+        if hasattr(torch.ops, lib_name) and hasattr(
+            getattr(torch.ops, lib_name), op_name
+        ):
+            # Operator already exists, skip registration
+            return
+    except (AttributeError, RuntimeError):
+        # Operator doesn't exist, proceed with registration
+        pass
+
     if hasattr(torch.library, "infer_schema"):
         schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
     else:
@@ -1676,11 +1710,22 @@
 
         schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
 
-    my_lib = target_lib or sglang_lib
-    my_lib.define(op_name + schema_str)
-    my_lib.impl(op_name, op_func, "CUDA")
-    if fake_impl is not None:
-        my_lib._register_fake(op_name, fake_impl)
+    try:
+        my_lib.define(op_name + schema_str)
+        my_lib.impl(op_name, op_func, "CUDA")
+        if fake_impl is not None:
+            my_lib._register_fake(op_name, fake_impl)
+    except RuntimeError as error:
+        if "Tried to register an operator" in str(error) and "multiple times" in str(error):
+            # Silently ignore duplicate registration errors
+            # This can happen in multi-engine scenarios
+            pass
+        else:
+            # Re-raise other RuntimeErrors
+            raise error
+    except AttributeError as error:
+        # Always re-raise AttributeError as it indicates missing dependencies
+        raise error
 
 
 def set_gpu_proc_affinity(
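The guard above makes `direct_register_custom_op` idempotent when several engines share one process. A standalone sketch of the same check-then-register pattern against `torch.library`, using a hypothetical `demo_lib` namespace and a CPU impl rather than SGLang's actual library object:

```python
import torch
import torch.library

# Hypothetical fragment library for illustration; SGLang uses its own namespace.
demo_lib = torch.library.Library("demo_lib", "FRAGMENT")


def register_once(op_name: str, schema: str, op_func) -> None:
    # Skip if torch.ops already exposes this op (e.g., a second engine
    # in the same process registered it first).
    ns = getattr(torch.ops, "demo_lib", None)
    if ns is not None and hasattr(ns, op_name):
        return
    try:
        demo_lib.define(op_name + schema)
        demo_lib.impl(op_name, op_func, "CPU")
    except RuntimeError as error:
        # Tolerate races where another thread registered between check and define.
        if "multiple times" not in str(error):
            raise


def add_one(x: torch.Tensor) -> torch.Tensor:
    return x + 1


register_once("add_one", "(Tensor x) -> Tensor", add_one)
register_once("add_one", "(Tensor x) -> Tensor", add_one)  # no-op the second time
```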
@@ -1919,6 +1964,15 @@ def get_ip() -> str:
     except Exception:
         pass
 
+    # try using hostname
+    hostname = socket.gethostname()
+    try:
+        ip_addr = socket.gethostbyname(hostname)
+        warnings.warn("using local ip address: {}".format(ip_addr))
+        return ip_addr
+    except Exception:
+        pass
+
     warnings.warn(
         "Failed to get the IP address, using 0.0.0.0 by default."
         "The value can be set by the environment variable"
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post2"
+__version__ = "0.5.1.post3"
{sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post2
+Version: 0.5.1.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -257,7 +257,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
+Requires-Dist: sgl-kernel==0.3.7; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
@@ -304,6 +304,7 @@ Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Requires-Dist: pytest; extra == "test"
+Requires-Dist: tabulate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -374,7 +375,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).