sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -44,6 +44,12 @@ ASSISTANT_SUFFIX = "Assistant:"
 global args
 
 
+# don't want to import sglang package here
+def _get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
 @dataclass
 class RequestFuncInput:
     prompt: str
@@ -969,6 +975,7 @@ async def benchmark(
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
+    flush_cache: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -986,13 +993,16 @@ async def benchmark(
         return await request_func(request_func_input=request_func_input, pbar=pbar)
 
     # Warmup
-    print("Starting initial single prompt test run...")
+    print(f"Starting warmup with {args.warmup_requests} sequences...")
+
+    # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     if lora_names != None and len(lora_names) != 0:
         lora_name = lora_names[0]
     else:
         lora_name = None
 
+    # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -1002,17 +1012,29 @@ async def benchmark(
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
-    test_output = await request_func(request_func_input=test_input)
-    if not test_output.success:
+
+    # Run warmup requests
+    warmup_tasks = []
+    for _ in range(args.warmup_requests):
+        warmup_tasks.append(
+            asyncio.create_task(request_func(request_func_input=test_input))
+        )
+
+    warmup_outputs = await asyncio.gather(*warmup_tasks)
+
+    # Check if at least one warmup request succeeded
+    if not any(output.success for output in warmup_outputs):
         raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}"
+            "Warmup failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {warmup_outputs[0].error}"
        )
     else:
-        print("Initial test run completed. Starting main benchmark run...")
+        print(
+            f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
+        )
 
     # Flush cache
-    if "sglang" in backend:
+    if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
         requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
     time.sleep(1.0)
@@ -1246,6 +1268,10 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
 
+    # Set default value for warmup_requests if not present
+    if not hasattr(args, "warmup_requests"):
+        args.warmup_requests = 1
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -1347,6 +1373,10 @@ def run_benchmark(args_: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer)
 
+    # compatible with SimpleNamespace
+    if not hasattr(args, "flush_cache"):
+        args.flush_cache = False
+
     return asyncio.run(
         benchmark(
             backend=backend,
@@ -1362,6 +1392,7 @@ def run_benchmark(args_: argparse.Namespace):
             extra_request_body=extra_request_body,
             profile=args.profile,
             pd_seperated=args.pd_seperated,
+            flush_cache=args.flush_cache,
         )
     )
 
@@ -1543,6 +1574,17 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        help="Flush the cache before running the benchmark",
+    )
+    parser.add_argument(
+        "--warmup-requests",
+        type=int,
+        default=1,
+        help="Number of warmup requests to run before the benchmark",
+    )
 
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
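
For reference, a minimal sketch of how the two new options above might be exercised from the command line. The --backend, --model, and --num-prompts flags already exist in bench_serving.py; the model id and workload size below are placeholders, not defaults.

# Hypothetical invocation exercising the new warmup/flush options.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--model", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        "--num-prompts", "200",                         # placeholder workload size
        "--warmup-requests", "4",  # send 4 identical warmup requests before timing
        "--flush-cache",           # POST /flush_cache before the main run
    ],
    check=True,
)
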
sglang/lang/chat_template.py CHANGED
@@ -294,6 +294,30 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="llama-4",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|header_start|>system<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+            "user": (
+                "<|header_start|>user<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+            "assistant": (
+                "<|header_start|>assistant<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+        },
+        stop_str=("<|eot|>",),
+        image_token="<|image|>",
+    )
+)
+
 # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
 register_chat_template(
     ChatTemplate(
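
To make the wire format concrete, here is an illustration of the prompt the "llama-4" role markers above produce for a single user turn. It assembles the strings by hand and deliberately bypasses sglang's template machinery; it is an illustration only, not part of the package.

ROLE_PREFIX_AND_SUFFIX = {
    "system": ("<|header_start|>system<|header_end|>\n\n", "<|eot|>"),
    "user": ("<|header_start|>user<|header_end|>\n\n", "<|eot|>"),
    "assistant": ("<|header_start|>assistant<|header_end|>\n\n", "<|eot|>"),
}

def render(messages):
    # Wrap each completed turn in its role markers, then leave an open
    # assistant header so the model continues from there.
    parts = []
    for role, content in messages:
        prefix, suffix = ROLE_PREFIX_AND_SUFFIX[role]
        parts.append(f"{prefix}{content}{suffix}")
    parts.append(ROLE_PREFIX_AND_SUFFIX["assistant"][0])
    return "".join(parts)

print(render([("user", "What is the capital of France?")]))
# <|header_start|>user<|header_end|>
#
# What is the capital of France?<|eot|><|header_start|>assistant<|header_end|>
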
sglang/srt/_custom_ops.py CHANGED
@@ -27,17 +27,20 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
 
-if use_vllm_custom_allreduce and not is_hip():
-    # vLLM custom allreduce
+if not is_hip():
+    if use_vllm_custom_allreduce:
+        custom_op = torch.ops._C_custom_ar
+    else:
+        custom_op = sgl_kernel.allreduce
+
+    # custom allreduce
     def init_custom_ar(
         ipc_tensors: List[torch.Tensor],
         rank_data: torch.Tensor,
         rank: int,
         full_nvlink: bool,
     ) -> int:
-        return torch.ops._C_custom_ar.init_custom_ar(
-            ipc_tensors, rank_data, rank, full_nvlink
-        )
+        return custom_op.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
 
     def all_reduce(
         fa: int,
@@ -46,105 +49,69 @@ if use_vllm_custom_allreduce and not is_hip():
         reg_buffer: int,
         reg_buffer_sz_bytes: int,
     ) -> None:
-        torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+        custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
 
     def dispose(fa: int) -> None:
-        torch.ops._C_custom_ar.dispose(fa)
+        custom_op.dispose(fa)
 
     def meta_size() -> int:
-        return torch.ops._C_custom_ar.meta_size()
+        return custom_op.meta_size()
 
     def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
-        return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors)
+        return custom_op.register_buffer(fa, ipc_tensors)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
+        return custom_op.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[List[int]], offsets: List[List[int]]
     ) -> None:
-        torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
+        custom_op.register_graph_buffers(fa, handles, offsets)
 
 else:
-    if is_hip():
-        # ROCM custom allreduce
-
-        def init_custom_ar(
-            meta: torch.Tensor,
-            rank_data: torch.Tensor,
-            handles: List[str],
-            offsets: List[int],
-            rank: int,
-            full_nvlink: bool,
-        ) -> int:
-            return sgl_kernel.allreduce.init_custom_ar(
-                meta, rank_data, handles, offsets, rank, full_nvlink
-            )
-
-        def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-            sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
-
-        def all_reduce_unreg(
-            fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
-        ) -> None:
-            sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
-
-        def dispose(fa: int) -> None:
-            sgl_kernel.allreduce.dispose(fa)
-
-        def meta_size() -> int:
-            return sgl_kernel.allreduce.meta_size()
-
-        def register_buffer(
-            fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
-        ) -> None:
-            return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
-
-        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-            return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
-
-        def register_graph_buffers(
-            fa: int, handles: List[str], offsets: List[List[int]]
-        ) -> None:
-            sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
-
-        def allocate_meta_buffer(size: int) -> torch.Tensor:
-            return sgl_kernel.allreduce.allocate_meta_buffer(size)
-
-        def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-            return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+    # ROCM custom allreduce
 
-    else:
-        # TRTLLM custom allreduce
-        def init_custom_ar(
-            rank_id: int,
-            world_size: int,
-            rank_data_base: torch.Tensor,
-            buffers: List[int],
-            tmp_result_buffers: List[int],
-            barrier_in: List[int],
-            barrier_out: List[int],
-        ) -> int:
-            return sgl_kernel.init_custom_reduce(
-                rank_id,
-                world_size,
-                rank_data_base,
-                buffers,
-                tmp_result_buffers,
-                barrier_in,
-                barrier_out,
-            )
-
-        def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-            sgl_kernel.custom_reduce(fa, inp, out)
-
-        def dispose(fa: int) -> None:
-            sgl_kernel.custom_dispose(fa)
-
-        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-            return sgl_kernel.get_graph_buffer_ipc_meta(fa)
-
-        def register_graph_buffers(
-            fa: int, handles: List[List[int]], offsets: List[List[int]]
-        ) -> None:
-            sgl_kernel.register_graph_buffers(fa, handles, offsets)
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_ar(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
+
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+
+    def dispose(fa: int) -> None:
+        sgl_kernel.allreduce.dispose(fa)
+
+    def meta_size() -> int:
+        return sgl_kernel.allreduce.meta_size()
+
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
+
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
+
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
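
The net effect of this refactor is that the CUDA wrappers no longer branch per call: the kernel namespace is chosen once at import time and every wrapper forwards through the single custom_op alias, while the HIP path keeps its own sgl_kernel.allreduce bindings. A stripped-down sketch of the same pattern, using stand-in objects rather than the real kernel namespaces:

class _VllmOpsStub:
    @staticmethod
    def meta_size() -> int:
        return 64  # stand-in value, not the real metadata size

class _SglOpsStub:
    @staticmethod
    def meta_size() -> int:
        return 64  # stand-in value, not the real metadata size

use_vllm_custom_allreduce = False  # normally derived from the environment
custom_op = _VllmOpsStub if use_vllm_custom_allreduce else _SglOpsStub

def meta_size() -> int:
    # Callers never branch; they always go through the selected namespace.
    return custom_op.meta_size()

assert meta_size() == 64
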
sglang/srt/configs/model_config.py CHANGED
@@ -65,6 +65,9 @@ class ModelConfig:
             **kwargs,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.attention_chunk_size = getattr(
+            self.hf_text_config, "attention_chunk_size", None
+        )
 
         # Check model type
         self.is_generation = is_generation_model(
@@ -258,6 +261,7 @@ class ModelConfig:
             "experts_int8",
             "w8a8_int8",
             "w8a8_fp8",
+            "moe_wna16",
         ]
         compatible_quantization_methods = {
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
@@ -466,6 +470,7 @@ multimodal_model_archs = [
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
+    # TODO: add multimodal support for "Llama4ForConditionalGeneration",
     "LlavaLlamaForCausalLM",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -169,7 +169,9 @@ class BaseGrammarBackend(ABC):
         self.cache.clear()
 
 
-def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
+def create_grammar_backend(
+    server_args: ServerArgs, tokenizer, vocab_size: int
+) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
 
@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
             tokenizer=tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
+    elif server_args.grammar_backend == "none":
+        return None
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
sglang/srt/conversation.py CHANGED
@@ -33,6 +33,7 @@ class SeparatorStyle(IntEnum):
     ADD_NEW_LINE_SINGLE = auto()
     LLAMA2 = auto()
     LLAMA3 = auto()
+    LLAMA4 = auto()
     CHATGLM = auto()
     CHATML = auto()
     CHATINTERN = auto()
@@ -156,19 +157,30 @@ class Conversation:
                 else:
                     ret += role + ":"
             return ret
+        elif self.sep_style == SeparatorStyle.LLAMA4:
+            # begin_of_text is added by default
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+                    ret += f"{message.strip()}<|eot|>"
+                else:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+            return ret
         elif self.sep_style == SeparatorStyle.LLAMA3:
-            ret = "<|begin_of_text|>"
             if self.system_message:
-                ret += system_prompt
+                ret = system_prompt
             else:
-                ret += ""
+                ret = ""
             for i, (role, message) in enumerate(self.messages):
                 if message:
                     ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
                     ret += f"{message.strip()}<|eot_id|>"
                 else:
                     ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
-            # print(ret)
             return ret
         elif self.sep_style == SeparatorStyle.LLAMA2:
             seps = [self.sep, self.sep2]
@@ -561,6 +573,19 @@ register_conv_template(
     )
 )
 
+# reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="llama-4",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot|>", "<|eom|>"],
+        image_token="<|image|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="chatml",
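
A worked example of what the new LLAMA4 branch returns for a short dialog. The rendering loop is copied from the diff above into a standalone function for illustration; note that, unlike the old LLAMA3 behavior, no <|begin_of_text|> is prepended because the tokenizer is expected to add it by default.

def render_llama4(system_prompt, messages):
    # Standalone copy of the LLAMA4 rendering loop, for illustration only.
    ret = system_prompt if system_prompt else ""
    for role, message in messages:
        if message:
            ret += f"<|header_start|>{role}<|header_end|>\n\n"
            ret += f"{message.strip()}<|eot|>"
        else:
            ret += f"<|header_start|>{role}<|header_end|>\n\n"
    return ret

print(
    render_llama4(
        "<|header_start|>system<|header_end|>\n\nYou are helpful.<|eot|>",
        [("user", "Hi"), ("assistant", None)],  # None leaves the assistant turn open
    )
)
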
sglang/srt/custom_op.py CHANGED
@@ -50,6 +50,7 @@ if _is_cuda:
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
         use_per_token_if_dynamic: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -59,6 +60,8 @@ if _is_cuda:
             input (torch.Tensor): Input tensor to be quantized
             scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
                 If None, scales will be computed dynamically.
+            num_token_padding (Optional[int]): If specified, pad the first dimension
+                of the output to at least this value.
             use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
                 determines the quantization granularity:
                 - True: compute scale per token
@@ -75,6 +78,8 @@ if _is_cuda:
         assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
         shape = input.shape
         out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
 
         if scale is None:
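
The new num_token_padding argument only changes the shape of the allocated output tensor; the quantization itself is untouched. A shape-only sketch of that arithmetic (no GPU kernel is invoked here, this simply mirrors the two lines added above):

def padded_output_shape(input_shape, num_token_padding=None):
    # First dimension is padded up to at least num_token_padding, if given.
    num_tokens, hidden = input_shape
    if num_token_padding:
        num_tokens = max(num_token_padding, num_tokens)
    return (num_tokens, hidden)

assert padded_output_shape((3, 4096)) == (3, 4096)                         # no padding requested
assert padded_output_shape((3, 4096), num_token_padding=8) == (8, 4096)    # padded up
assert padded_output_shape((16, 4096), num_token_padding=8) == (16, 4096)  # already large enough
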
sglang/srt/distributed/device_communicators/custom_all_reduce.py CHANGED
@@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
     gpu_p2p_access_check,
 )
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
-from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
 logger = logging.getLogger(__name__)
 
@@ -217,7 +217,7 @@ class CustomAllreduce:
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(cuda_device_count_stateless()))
+            device_ids = list(range(torch.cuda.device_count()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
@@ -257,7 +257,7 @@ class CustomAllreduce:
         self.world_size = world_size
         self.full_nvlink = full_nvlink
 
-        if ops.use_vllm_custom_allreduce and not _is_hip:
+        if not _is_hip:
             # Buffers memory are owned by this Python class and passed to C++.
             # Meta data composes of two parts: meta data for synchronization and a
             # temporary buffer for storing intermediate allreduce results.
@@ -280,56 +280,24 @@ class CustomAllreduce:
             )
             ops.register_buffer(self._ptr, self.buffer_ptrs)
         else:
-            if _is_hip:
-                # meta data buffers need to be "uncached" for signal on MI200
-                self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
-                self.buffer = torch.empty(
-                    max_size, dtype=torch.uint8, device=self.device
-                )
-                handle = ops.get_meta_buffer_ipc_handle(self.meta)
-                shard_data = (
-                    bytes(handle),  # ipc handle to base ptr
-                    0,  # offset of base ptr
-                )
-                handles, offsets = self._gather_ipc_meta(shard_data)
-                self.rank_data = torch.empty(
-                    8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-                )
-                self._ptr = ops.init_custom_ar(
-                    self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
-                )
-                self.register_buffer(self.buffer)
-                self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
-            else:
-                # From TensorRT-LLM getMaxRequiredWorkspaceSize
-                self.max_required_workspace_size = [16 * 1024 * 1024, 8 * 1024 * 1024]
-
-                # sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
-                self.barrier_max_size = 8 * (36 + 2) * 8
-
-                self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
-                self.tmp_result_buffer_ptrs = self.create_shared_buffer(
-                    max_size, group=group
-                )
-                self.rank_data_base = torch.empty(
-                    8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-                )
-                self.barrier_in_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
-                self.barrier_out_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
+            # meta data buffers need to be "uncached" for signal on MI200
+            self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
+            self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
+            handle = ops.get_meta_buffer_ipc_handle(self.meta)
+            shard_data = (
+                bytes(handle),  # ipc handle to base ptr
+                0,  # offset of base ptr
+            )
+            handles, offsets = self._gather_ipc_meta(shard_data)
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
+            )
+            self.register_buffer(self.buffer)
+            self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
 
-                self._ptr = ops.init_custom_ar(
-                    rank,
-                    world_size,
-                    self.rank_data_base,
-                    self.buffer_ptrs,
-                    self.tmp_result_buffer_ptrs,
-                    self.barrier_in_ptrs,
-                    self.barrier_out_ptrs,
-                )
         self.disabled = False
 
     @staticmethod
@@ -455,7 +423,7 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
-        if ops.use_vllm_custom_allreduce and not _is_hip:
+        if not _is_hip:
            if self.world_size == 2 or self.full_nvlink:
                return inp_size < self.max_size
            return False
@@ -471,18 +439,6 @@ class CustomAllreduce:
                 return inp_size < self.max_size
             return False
 
-        if self.world_size == 2:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[0]
-            )
-
-        if self.full_nvlink:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[1]
-            )
-
         return False
 
     # all reduce, assuming inp tensor is IPC registered with register_buffer,
@@ -515,15 +471,12 @@ class CustomAllreduce:
         """
         if out is None:
             out = torch.empty_like(inp)
-        if ops.use_vllm_custom_allreduce:
-            if registered:
-                ops.all_reduce(self._ptr, inp, out, 0, 0)
-            else:
-                ops.all_reduce(
-                    self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
-                )
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
         else:
-            ops.all_reduce(self._ptr, inp, out)
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
         return out
 
     def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
@@ -554,14 +507,9 @@ class CustomAllreduce:
     def close(self):
         if not self.disabled and self._ptr:
             ops.dispose(self._ptr)
-            if ops.use_vllm_custom_allreduce:
+            if _is_cuda:
                 self.free_shared_buffer(self.meta_ptrs)
                 self.free_shared_buffer(self.buffer_ptrs)
-            elif _is_cuda:
-                self.free_shared_buffer(self.buffer_ptrs)
-                self.free_shared_buffer(self.tmp_result_buffer_ptrs)
-                self.free_shared_buffer(self.barrier_in_ptrs)
-                self.free_shared_buffer(self.barrier_out_ptrs)
             self._ptr = 0
 
     def __del__(self):
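
With the TensorRT-LLM workspace limits gone, the size check that remains on the non-HIP path reduces to the condition kept above. A condensed, self-contained restatement for reference (not the actual class method):

def should_use_custom_allreduce(inp_size: int, max_size: int,
                                world_size: int, full_nvlink: bool) -> bool:
    # Custom all-reduce only pays off for 2 ranks or a fully NVLinked group,
    # and only for inputs smaller than the registered buffer.
    if world_size == 2 or full_nvlink:
        return inp_size < max_size
    return False

assert should_use_custom_allreduce(1 << 20, 8 << 20, world_size=2, full_nvlink=False)
assert not should_use_custom_allreduce(1 << 20, 8 << 20, world_size=4, full_nvlink=False)
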
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py CHANGED
@@ -11,11 +11,11 @@ import tempfile
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
 from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-from sglang.srt.utils import cuda_device_count_stateless
 
 logger = logging.getLogger(__name__)
 
@@ -218,7 +218,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev = cuda_device_count_stateless()
+    num_dev = torch.cuda.device_count()
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
sglang/srt/entrypoints/engine.py CHANGED
@@ -151,10 +151,6 @@ class Engine:
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
         Please refer to `GenerateReqInput` for the documentation.
         """
-        modalities_list = []
-        if image_data is not None:
-            modalities_list.append("image")
-
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -165,7 +161,6 @@ class Engine:
             top_logprobs_num=top_logprobs_num,
             token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
-            modalities=modalities_list,
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,