sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -25,7 +25,6 @@ from typing import List, Literal, Optional, Union
 
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -39,6 +38,8 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
@@ -47,12 +48,87 @@ from sglang.srt.utils import (
 logger = logging.getLogger(__name__)
 
 
+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
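The new module-level choice lists come with `add_*_choices` hooks so out-of-tree integrations can extend the accepted CLI values without patching `server_args.py`; the hooks mutate the lists in place, so they must run before `ServerArgs.add_cli_args` builds the argument parser. A minimal sketch of how a plugin might use them (the registered values below are illustrative, not shipped with SGLang):

```python
# Hypothetical plugin module, imported before the CLI parser is constructed.
from sglang.srt import server_args

# Register an extra --load-format value and an extra --attention-backend value.
# Both names are placeholders for whatever the plugin actually provides.
server_args.add_load_format_choices(["my_custom_format"])
server_args.add_attention_backend_choices(["my_custom_backend"])
```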
@@ -199,6 +275,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -211,11 +288,12 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "write_through_selective"
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -671,6 +749,15 @@ class ServerArgs:
             )
             self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+                )
+
         # The token generated from the verify step is counted.
         # If speculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
         # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -741,6 +828,12 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The number of workers for the tokenizer manager.",
+        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -759,18 +852,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -889,25 +971,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1357,43 +1421,24 @@ class ServerArgs:
         )
 
         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1558,6 +1603,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1641,6 +1692,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )
 
         # Double Sparsity
         parser.add_argument(
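Since `--hicache-storage-backend-extra-config` is documented as a JSON-encoded dictionary carried in a plain string, a consumer of `ServerArgs` would be expected to decode it before handing it to the storage backend. A minimal sketch, assuming a hypothetical key (real options depend on the chosen backend):

```python
import json

# On the command line the flag would look roughly like:
#   --hicache-storage-backend-extra-config '{"some_backend_option": 123}'
raw = '{"some_backend_option": 123}'  # stand-in for server_args.hicache_storage_backend_extra_config
extra_config = json.loads(raw) if raw else {}
print(extra_config["some_backend_option"])  # -> 123
```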
@@ -1951,7 +2008,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl", "ascend"],
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -2126,6 +2183,9 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"
 
+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must be >= 1"
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
 
@@ -2271,6 +2331,7 @@ class ServerArgs:
             if is_mxfp4_quant_format:
                 # use bf16 for mxfp4 triton kernels
                 self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
@@ -2368,6 +2429,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str
 
+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2391,6 +2455,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                tokenizer_worker_ipc_name=None,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2424,6 +2489,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+                tokenizer_worker_ipc_name=None,
             )
 
 
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
+    get_bool_env_var,
     is_cuda,
     next_power_of_2,
 )
@@ -54,6 +55,7 @@ if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
 
 
 @contextmanager
@@ -137,8 +139,15 @@ class EAGLEWorker(TpModelWorker):
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
 
         if self.speculative_algorithm.is_eagle3():
-            # EAGLE3 models don't share lm_head
-            self.draft_model_runner.model.set_embed(embed)
+            # In most cases, EAGLE3 models don't share lm_head,
+            # but some models (e.g. nvidia/gpt-oss-120b-Eagle3) do share it.
+            if (
+                hasattr(self.draft_model_runner.model, "load_lm_head_from_target")
+                and self.draft_model_runner.model.load_lm_head_from_target
+            ):
+                self.draft_model_runner.model.set_embed_and_head(embed, head)
+            else:
+                self.draft_model_runner.model.set_embed(embed)
 
         # grab hot token ids
         if self.draft_model_runner.model.hot_token_id is not None:
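The worker now duck-types the draft model: it checks for a truthy `load_lm_head_from_target` attribute before deciding whether to copy the target's lm_head along with its embedding. A hedged sketch of what a draft model opting in might look like (the class and weight attributes are illustrative; only the flag name, `set_embed_and_head`, and `set_embed` follow the diff):

```python
class MyEagle3DraftModel:  # hypothetical draft model
    # Opt in to reusing the target model's lm_head (as e.g. nvidia/gpt-oss-120b-Eagle3 does).
    load_lm_head_from_target = True

    def set_embed_and_head(self, embed, head):
        # Reuse both the embedding table and the lm_head weight from the target model.
        self.embed_weight = embed
        self.lm_head_weight = head

    def set_embed(self, embed):
        # Fallback used when the draft model ships its own lm_head.
        self.embed_weight = embed


draft = MyEagle3DraftModel()
# Mirrors the hasattr-based check the worker performs before choosing a path.
print(getattr(draft, "load_lm_head_from_target", False))  # -> True
```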
@@ -781,15 +790,20 @@ class EAGLEWorker(TpModelWorker):
         token_ids_logprobs = batch.token_ids_logprobs
         accepted_indices = res.accepted_indices
         assert len(accepted_indices) == len(logits_output.next_token_logits)
+
         temperatures = batch.sampling_info.temperatures
         num_draft_tokens = batch.spec_info.draft_token_num
         # acceptance indices are the indices in a "flattened" batch.
         # dividing it by num_draft_tokens will yield the actual batch index.
         temperatures = temperatures[accepted_indices // num_draft_tokens]
-
-        logprobs = torch.nn.functional.log_softmax(
-            logits_output.next_token_logits / temperatures, dim=-1
-        )
+        if RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
+            )
+        else:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits / temperatures, dim=-1
+            )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]
 
@@ -806,13 +820,19 @@ class EAGLEWorker(TpModelWorker):
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums_repeat_interleaved)
+            ) = get_top_logprobs(
+                logprobs,
+                top_logprobs_nums_repeat_interleaved,
+            )
 
         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(logprobs, token_ids_logprobs_repeat_interleaved)
+            ) = get_token_ids_logprobs(
+                logprobs,
+                token_ids_logprobs_repeat_interleaved,
+            )
 
         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
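The `RETURN_ORIGINAL_LOGPROB` switch only changes whether the logits are divided by the sampling temperature before the log-softmax: by default the returned logprobs describe the temperature-scaled distribution that was actually sampled from, while the environment variable switches to the raw model distribution. A small self-contained illustration (the numbers are arbitrary):

```python
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5]])
temperature = 0.7

# Default path: logprobs of the temperature-scaled distribution used for sampling.
scaled = F.log_softmax(logits / temperature, dim=-1)

# With RETURN_ORIGINAL_LOGPROB set: logprobs of the unscaled model distribution.
original = F.log_softmax(logits, dim=-1)

print(scaled)    # sharper, since temperature < 1
print(original)
```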
sglang/srt/tokenizer/tiktoken_tokenizer.py CHANGED
@@ -121,7 +121,12 @@ class TiktokenTokenizer:
         return self.tokenizer.decode_batch(batch)
 
     def apply_chat_template(
-        self, messages, tokenize, add_generation_prompt, tools=None
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
     ):
         ret = self.chat_template_jinja.render(
             messages=messages, add_generation_prompt=add_generation_prompt
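The extra `reasoning_effort` keyword keeps the wrapper signature compatible with callers that forward it from chat requests; in the context shown it is accepted but not yet passed to the Jinja render call. A hypothetical call site under that assumption (the message content and the "low" value are illustrative):

```python
# Assuming `tok` is a TiktokenTokenizer instance with a chat template loaded.
prompt = tok.apply_chat_template(
    messages=[{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
    reasoning_effort="low",  # accepted by the new signature; handling depends on the template
)
```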
sglang/srt/utils.py CHANGED
@@ -172,6 +172,20 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10
 
 
+@lru_cache(maxsize=1)
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
 _warned_bool_env_var_keys = set()
 
 
@@ -1665,9 +1679,29 @@ def direct_register_custom_op(
     IMPORTANT: the lifetime of the operator is tied to the lifetime of the
     library object. If you want to bind the operator to a different library,
     make sure the library object is alive when the operator is used.
+
+    Note: This function will silently skip registration if the operator
+    with the same name is already registered to avoid RuntimeError in
+    multi-engine scenarios (e.g., VERL framework).
     """
     import torch.library
 
+    my_lib = target_lib or sglang_lib
+
+    # Check if operator is already registered to avoid duplicate registration
+    # This is important for scenarios where multiple SGLang engines run in the same process
+    try:
+        # Try to access the operator to see if it's already registered
+        lib_name = my_lib.m.name if hasattr(my_lib.m, "name") else "sglang"
+        if hasattr(torch.ops, lib_name) and hasattr(
+            getattr(torch.ops, lib_name), op_name
+        ):
+            # Operator already exists, skip registration
+            return
+    except (AttributeError, RuntimeError):
+        # Operator doesn't exist, proceed with registration
+        pass
+
     if hasattr(torch.library, "infer_schema"):
         schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
     else:
@@ -1676,11 +1710,22 @@
 
         schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
 
-    my_lib = target_lib or sglang_lib
-    my_lib.define(op_name + schema_str)
-    my_lib.impl(op_name, op_func, "CUDA")
-    if fake_impl is not None:
-        my_lib._register_fake(op_name, fake_impl)
+    try:
+        my_lib.define(op_name + schema_str)
+        my_lib.impl(op_name, op_func, "CUDA")
+        if fake_impl is not None:
+            my_lib._register_fake(op_name, fake_impl)
+    except RuntimeError as error:
+        if "Tried to register an operator" in str(error) and "multiple times" in str(error):
+            # Silently ignore duplicate registration errors
+            # This can happen in multi-engine scenarios
+            pass
+        else:
+            # Re-raise other RuntimeErrors
+            raise error
+    except AttributeError as error:
+        # Always re-raise AttributeError as it indicates missing dependencies
+        raise error
 
 
 def set_gpu_proc_affinity(
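The duplicate-registration guard matters when two engines in one process both try to define the same custom op. The same pattern can be reproduced with a bare `torch.library.Library`; a minimal sketch under that assumption (the namespace and op name are made up for the example):

```python
import torch
from torch.library import Library

lib = Library("demo_ns", "FRAGMENT")  # hypothetical namespace


def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2


def register() -> None:
    try:
        lib.define("double(Tensor x) -> Tensor")
        lib.impl("double", double, "CPU")
    except RuntimeError as error:
        # Registering the same op twice raises a RuntimeError; swallow only that case.
        if "multiple times" not in str(error):
            raise


register()
register()  # second call is a no-op instead of crashing
print(torch.ops.demo_ns.double(torch.ones(2)))
```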
@@ -1919,6 +1964,15 @@ def get_ip() -> str:
     except Exception:
         pass
 
+    # try using hostname
+    hostname = socket.gethostname()
+    try:
+        ip_addr = socket.gethostbyname(hostname)
+        warnings.warn("using local ip address: {}".format(ip_addr))
+        return ip_addr
+    except Exception:
+        pass
+
     warnings.warn(
         "Failed to get the IP address, using 0.0.0.0 by default."
         "The value can be set by the environment variable"
@@ -2733,6 +2787,20 @@ def lru_cache_frozenset(maxsize=128):
     return decorator
 
 
+def get_worker_ids_from_req_rids(rids):
+    if isinstance(rids, list):
+        worker_ids = [int(rid.split("_")[0]) for rid in rids]
+    elif isinstance(rids, str):
+        worker_ids = [int(rids.split("_")[0])]
+    else:
+        worker_ids = []
+    return worker_ids
+
+
+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
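Both helpers assume the multi-tokenizer convention of prefixing a request id with its worker id and an underscore (e.g. `"3_abc123"`). A quick round-trip sketch using the functions from this diff (the rid values are made up):

```python
from sglang.srt.utils import get_origin_rid, get_worker_ids_from_req_rids

rids = ["0_req-aaa", "2_req-bbb"]
print(get_worker_ids_from_req_rids(rids))         # -> [0, 2]
print(get_worker_ids_from_req_rids("7_req-ccc"))  # -> [7]
print(get_origin_rid("2_req-bbb"))                # -> "req-bbb"
```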
sglang/test/attention/test_trtllm_mla_backend.py CHANGED
@@ -208,6 +208,15 @@ class MockModelRunner:
         self.kv_cache_dtype = config["kv_cache_dtype"]
         self.page_size = config["page_size"]
 
+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
         # Model-config stub with MLA attributes
         self.model_config = type(
             "ModelConfig",
@@ -833,7 +842,7 @@ class TestTRTLLMMLA(CustomTestCase):
 
         # Test workspace properties
         self.assertEqual(metadata.workspace.device.type, "cuda")
-        self.assertEqual(metadata.workspace.dtype, torch.int8)
+        self.assertEqual(metadata.workspace.dtype, torch.uint8)
         self.assertGreater(
             metadata.workspace.numel(), 0, "Workspace should have non-zero size"
         )
@@ -993,8 +1002,8 @@
         )
 
         # Verify CUDA graph buffers are allocated
-        self.assertIsNotNone(backend.cuda_graph_kv_indices)
-        self.assertIsNotNone(backend.cuda_graph_workspace)
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)
 
         # Test capture metadata
         seq_lens = torch.full(
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.1.post2"
+__version__ = "0.5.2rc0"
{sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post2
+Version: 0.5.2rc0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.55.2; extra == "runtime-common"
+Requires-Dist: transformers==4.56.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.5; extra == "srt"
+Requires-Dist: sgl-kernel==0.3.7.post1; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.14.post1; extra == "srt"
+Requires-Dist: flashinfer_python==0.3.0; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.14.post1; extra == "blackwell"
+Requires-Dist: flashinfer_python==0.3.0; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -304,6 +304,7 @@ Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Requires-Dist: pytest; extra == "test"
+Requires-Dist: tabulate; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
@@ -374,7 +375,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).