sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +49 -7
- sglang/lang/chat_template.py +24 -0
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/model_config.py +5 -0
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/conversation.py +29 -4
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/layers/attention/flashattention_backend.py +678 -83
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_native.py +5 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +49 -3
- sglang/srt/layers/quantization/__init__.py +5 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8.py +3 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/moe_wna16.py +503 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_int8.py +2 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +63 -12
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +7 -26
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -128
- sglang/srt/managers/scheduler.py +4 -4
- sglang/srt/managers/tokenizer_manager.py +1 -1
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +8 -6
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +60 -57
- sglang/srt/model_loader/loader.py +8 -0
- sglang/srt/models/clip.py +12 -7
- sglang/srt/models/deepseek_janus_pro.py +10 -15
- sglang/srt/models/deepseek_v2.py +212 -121
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_mm.py +14 -80
- sglang/srt/models/llama.py +16 -5
- sglang/srt/models/llama4.py +420 -0
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/mllama4.py +154 -0
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +18 -6
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +99 -14
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +93 -24
- sglang/srt/utils.py +104 -51
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +13 -26
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -44,6 +44,12 @@ ASSISTANT_SUFFIX = "Assistant:"
 global args
 
 
+# don't want to import sglang package here
+def _get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
 @dataclass
 class RequestFuncInput:
     prompt: str
@@ -969,6 +975,7 @@ async def benchmark(
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
+    flush_cache: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -986,13 +993,16 @@ async def benchmark(
         return await request_func(request_func_input=request_func_input, pbar=pbar)
 
     # Warmup
-    print("Starting
+    print(f"Starting warmup with {args.warmup_requests} sequences...")
+
+    # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     if lora_names != None and len(lora_names) != 0:
         lora_name = lora_names[0]
     else:
         lora_name = None
 
+    # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -1002,17 +1012,29 @@ async def benchmark(
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
-
-
+
+    # Run warmup requests
+    warmup_tasks = []
+    for _ in range(args.warmup_requests):
+        warmup_tasks.append(
+            asyncio.create_task(request_func(request_func_input=test_input))
+        )
+
+    warmup_outputs = await asyncio.gather(*warmup_tasks)
+
+    # Check if at least one warmup request succeeded
+    if not any(output.success for output in warmup_outputs):
         raise ValueError(
-            "
-            f"are correctly specified. Error: {
+            "Warmup failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {warmup_outputs[0].error}"
         )
     else:
-        print(
+        print(
+            f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
+        )
 
     # Flush cache
-    if "sglang" in backend:
+    if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
         requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
     time.sleep(1.0)
@@ -1246,6 +1268,10 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
 
+    # Set default value for warmup_requests if not present
+    if not hasattr(args, "warmup_requests"):
+        args.warmup_requests = 1
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -1347,6 +1373,10 @@ def run_benchmark(args_: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer)
 
+    # compatible with SimpleNamespace
+    if not hasattr(args, "flush_cache"):
+        args.flush_cache = False
+
     return asyncio.run(
         benchmark(
             backend=backend,
@@ -1362,6 +1392,7 @@ def run_benchmark(args_: argparse.Namespace):
             extra_request_body=extra_request_body,
             profile=args.profile,
             pd_seperated=args.pd_seperated,
+            flush_cache=args.flush_cache,
         )
     )
 
@@ -1543,6 +1574,17 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        help="Flush the cache before running the benchmark",
+    )
+    parser.add_argument(
+        "--warmup-requests",
+        type=int,
+        default=1,
+        help="Number of warmup requests to run before the benchmark",
+    )
 
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
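The warmup change above fans out `args.warmup_requests` copies of the first request concurrently and only requires one of them to succeed before the main run starts. Below is a minimal standalone sketch of that pattern; the request function and output type are stand-ins, not sglang's `ASYNC_REQUEST_FUNCS` entries.

```python
import asyncio
from dataclasses import dataclass


@dataclass
class FakeOutput:
    success: bool
    error: str = ""


async def fake_request_func(request_func_input=None) -> FakeOutput:
    # Stand-in for a backend request; always succeeds after a short delay.
    await asyncio.sleep(0.01)
    return FakeOutput(success=True)


async def run_warmup(test_input, warmup_requests: int) -> None:
    # Fan out identical warmup requests and gather their results.
    tasks = [
        asyncio.create_task(fake_request_func(request_func_input=test_input))
        for _ in range(warmup_requests)
    ]
    outputs = await asyncio.gather(*tasks)
    if not any(o.success for o in outputs):
        raise ValueError(f"Warmup failed. Error: {outputs[0].error}")
    print(f"Warmup completed with {warmup_requests} sequences.")


asyncio.run(run_warmup(test_input=None, warmup_requests=4))
```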
sglang/lang/chat_template.py
CHANGED
@@ -294,6 +294,30 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="llama-4",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|header_start|>system<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+            "user": (
+                "<|header_start|>user<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+            "assistant": (
+                "<|header_start|>assistant<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+        },
+        stop_str=("<|eot|>",),
+        image_token="<|image|>",
+    )
+)
+
 # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
 register_chat_template(
     ChatTemplate(
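Once registered, the template can be looked up by name and its fields inspected. The snippet below is a hedged sketch that assumes the `get_chat_template` lookup helper in `sglang.lang.chat_template`; the field names (`role_prefix_and_suffix`, `stop_str`, `image_token`) are taken from the registration above.

```python
# Assumes sglang is installed; get_chat_template is assumed to be the lookup
# side of the register_chat_template registry used in this module.
from sglang.lang.chat_template import get_chat_template

tmpl = get_chat_template("llama-4")
prefix, suffix = tmpl.role_prefix_and_suffix["user"]

# Hand-rolled prompt assembly from the registered prefixes/suffixes.
prompt = f"{prefix}What is the capital of France?{suffix}"
prompt += tmpl.role_prefix_and_suffix["assistant"][0]  # leave the assistant turn open
print(prompt)
print("stop:", tmpl.stop_str, "image token:", tmpl.image_token)
```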
sglang/srt/_custom_ops.py
CHANGED
@@ -27,17 +27,20 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
 
-if
-
+if not is_hip():
+    if use_vllm_custom_allreduce:
+        custom_op = torch.ops._C_custom_ar
+    else:
+        custom_op = sgl_kernel.allreduce
+
+    # custom allreduce
     def init_custom_ar(
         ipc_tensors: List[torch.Tensor],
         rank_data: torch.Tensor,
         rank: int,
         full_nvlink: bool,
     ) -> int:
-        return
-            ipc_tensors, rank_data, rank, full_nvlink
-        )
+        return custom_op.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
 
     def all_reduce(
         fa: int,
@@ -46,105 +49,69 @@ if use_vllm_custom_allreduce and not is_hip():
         reg_buffer: int,
         reg_buffer_sz_bytes: int,
     ) -> None:
-
+        custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
 
     def dispose(fa: int) -> None:
-
+        custom_op.dispose(fa)
 
     def meta_size() -> int:
-        return
+        return custom_op.meta_size()
 
     def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
-        return
+        return custom_op.register_buffer(fa, ipc_tensors)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return
+        return custom_op.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[List[int]], offsets: List[List[int]]
     ) -> None:
-
+        custom_op.register_graph_buffers(fa, handles, offsets)
 
 else:
-
-    # ROCM custom allreduce
-
-    def init_custom_ar(
-        meta: torch.Tensor,
-        rank_data: torch.Tensor,
-        handles: List[str],
-        offsets: List[int],
-        rank: int,
-        full_nvlink: bool,
-    ) -> int:
-        return sgl_kernel.allreduce.init_custom_ar(
-            meta, rank_data, handles, offsets, rank, full_nvlink
-        )
-
-    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
-
-    def all_reduce_unreg(
-        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
-    ) -> None:
-        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
-
-    def dispose(fa: int) -> None:
-        sgl_kernel.allreduce.dispose(fa)
-
-    def meta_size() -> int:
-        return sgl_kernel.allreduce.meta_size()
-
-    def register_buffer(
-        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
-    ) -> None:
-        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
-
-    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
-
-    def register_graph_buffers(
-        fa: int, handles: List[str], offsets: List[List[int]]
-    ) -> None:
-        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
-
-    def allocate_meta_buffer(size: int) -> torch.Tensor:
-        return sgl_kernel.allreduce.allocate_meta_buffer(size)
-
-    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+    # ROCM custom allreduce
 
-    [old lines 117-150 removed; their content is not captured in this diff view]
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_ar(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
+
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+
+    def dispose(fa: int) -> None:
+        sgl_kernel.allreduce.dispose(fa)
+
+    def meta_size() -> int:
+        return sgl_kernel.allreduce.meta_size()
+
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
+
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
+
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
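The non-ROCm path now binds a single `custom_op` alias at import time (either `torch.ops._C_custom_ar` or `sgl_kernel.allreduce`) and routes every wrapper through it. Below is a generic, self-contained sketch of that dispatch-by-alias pattern; the backends here are placeholder namespaces, not the real kernels.

```python
from types import SimpleNamespace

# Placeholder backends standing in for torch.ops._C_custom_ar / sgl_kernel.allreduce.
fast_backend = SimpleNamespace(meta_size=lambda: 64, all_reduce=lambda *a: print("fast", a))
slow_backend = SimpleNamespace(meta_size=lambda: 128, all_reduce=lambda *a: print("slow", a))

use_fast = True  # stand-in for a flag like use_vllm_custom_allreduce
custom_op = fast_backend if use_fast else slow_backend


def meta_size() -> int:
    # Thin wrapper, mirroring how _custom_ops forwards to custom_op.<fn>.
    return custom_op.meta_size()


def all_reduce(fa: int, inp, out, reg_buffer: int, reg_buffer_sz_bytes: int) -> None:
    custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)


print(meta_size())
all_reduce(0, None, None, 0, 0)
```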
sglang/srt/configs/model_config.py
CHANGED
@@ -65,6 +65,9 @@ class ModelConfig:
             **kwargs,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.attention_chunk_size = getattr(
+            self.hf_text_config, "attention_chunk_size", None
+        )
 
         # Check model type
         self.is_generation = is_generation_model(
@@ -258,6 +261,7 @@ class ModelConfig:
             "experts_int8",
             "w8a8_int8",
             "w8a8_fp8",
+            "moe_wna16",
         ]
         compatible_quantization_methods = {
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
@@ -466,6 +470,7 @@ multimodal_model_archs = [
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
+    # TODO: add multimodal support for "Llama4ForConditionalGeneration",
     "LlavaLlamaForCausalLM",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -169,7 +169,9 @@ class BaseGrammarBackend(ABC):
         self.cache.clear()
 
 
-def create_grammar_backend(
+def create_grammar_backend(
+    server_args: ServerArgs, tokenizer, vocab_size: int
+) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
 
@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
             tokenizer=tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
+    elif server_args.grammar_backend == "none":
+        return None
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
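With `--grammar-backend none`, `create_grammar_backend` now returns `None` instead of a backend object, so call sites must treat the result as `Optional`. A hedged sketch of the caller-side handling follows; the class and method below are illustrative, not the real scheduler code.

```python
from typing import Optional


class ToyGrammarBackend:
    def get_grammar(self, key: str) -> str:
        return f"compiled grammar for {key!r}"


def create_backend(name: str) -> Optional[ToyGrammarBackend]:
    # Mirrors the new behavior: "none" disables constrained decoding entirely.
    if name == "none":
        return None
    return ToyGrammarBackend()


backend = create_backend("none")
if backend is None:
    print("Constrained decoding disabled; requests skip grammar compilation.")
else:
    print(backend.get_grammar("json_schema"))
```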
sglang/srt/conversation.py
CHANGED
@@ -33,6 +33,7 @@ class SeparatorStyle(IntEnum):
     ADD_NEW_LINE_SINGLE = auto()
     LLAMA2 = auto()
     LLAMA3 = auto()
+    LLAMA4 = auto()
     CHATGLM = auto()
     CHATML = auto()
     CHATINTERN = auto()
@@ -156,19 +157,30 @@ class Conversation:
             else:
                 ret += role + ":"
             return ret
+        elif self.sep_style == SeparatorStyle.LLAMA4:
+            # begin_of_text is added by default
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+                    ret += f"{message.strip()}<|eot|>"
+                else:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+            return ret
         elif self.sep_style == SeparatorStyle.LLAMA3:
-            ret = "<|begin_of_text|>"
             if self.system_message:
-                ret
+                ret = system_prompt
             else:
-                ret
+                ret = ""
             for i, (role, message) in enumerate(self.messages):
                 if message:
                     ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
                     ret += f"{message.strip()}<|eot_id|>"
                 else:
                     ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
-            # print(ret)
             return ret
         elif self.sep_style == SeparatorStyle.LLAMA2:
             seps = [self.sep, self.sep2]
@@ -561,6 +573,19 @@ register_conv_template(
     )
 )
 
+# reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="llama-4",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot|>", "<|eom|>"],
+        image_token="<|image|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="chatml",
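The new `SeparatorStyle.LLAMA4` branch renders a conversation exactly as in the loop above: a header per turn, the stripped message, and `<|eot|>`, with `<|begin_of_text|>` left to the tokenizer. The function below is a standalone restatement of that loop, not a call into sglang's `Conversation` class.

```python
def render_llama4(system_message: str, messages: list) -> str:
    # <|begin_of_text|> is assumed to be added by the tokenizer, per the diff comment.
    if system_message:
        ret = f"<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>"
    else:
        ret = ""
    for role, message in messages:
        ret += f"<|header_start|>{role}<|header_end|>\n\n"
        if message:
            ret += f"{message.strip()}<|eot|>"
    return ret


# An empty final assistant message leaves the header open for generation.
print(render_llama4("You are concise.", [("user", "Hi there"), ("assistant", "")]))
```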
sglang/srt/custom_op.py
CHANGED
@@ -50,6 +50,7 @@ if _is_cuda:
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
         use_per_token_if_dynamic: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -59,6 +60,8 @@ if _is_cuda:
             input (torch.Tensor): Input tensor to be quantized
             scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
                 If None, scales will be computed dynamically.
+            num_token_padding (Optional[int]): If specified, pad the first dimension
+                of the output to at least this value.
             use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
                 determines the quantization granularity:
                 - True: compute scale per token
@@ -75,6 +78,8 @@ if _is_cuda:
         assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
         shape = input.shape
         out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
 
         if scale is None:
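`num_token_padding` only changes the allocated output shape: the first dimension becomes `max(num_token_padding, input.shape[0])`. A small sketch of just that shape logic, using plain tuples and no FP8 kernel:

```python
from typing import Optional, Tuple


def padded_output_shape(input_shape: Tuple[int, int], num_token_padding: Optional[int]) -> Tuple[int, int]:
    # Mirrors the new branch in scaled_fp8_quant: pad dim 0 up to num_token_padding.
    shape = input_shape
    if num_token_padding:
        shape = (max(num_token_padding, input_shape[0]), shape[1])
    return shape


print(padded_output_shape((3, 4096), None))  # (3, 4096)
print(padded_output_shape((3, 4096), 16))    # (16, 4096)
print(padded_output_shape((32, 4096), 16))   # (32, 4096) -- already large enough
```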
sglang/srt/distributed/device_communicators/custom_all_reduce.py
CHANGED
@@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
     gpu_p2p_access_check,
 )
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
-from sglang.srt.utils import
+from sglang.srt.utils import is_cuda, is_hip
 
 logger = logging.getLogger(__name__)
 
@@ -217,7 +217,7 @@ class CustomAllreduce:
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(
+            device_ids = list(range(torch.cuda.device_count()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
@@ -257,7 +257,7 @@ class CustomAllreduce:
         self.world_size = world_size
         self.full_nvlink = full_nvlink
 
-        if
+        if not _is_hip:
             # Buffers memory are owned by this Python class and passed to C++.
             # Meta data composes of two parts: meta data for synchronization and a
             # temporary buffer for storing intermediate allreduce results.
@@ -280,56 +280,24 @@ class CustomAllreduce:
             )
             ops.register_buffer(self._ptr, self.buffer_ptrs)
         else:
-            [old lines 283-287 removed; content not captured in this diff view]
-                )
-            [old lines 289-299 removed; content not captured in this diff view]
-                )
-                self.register_buffer(self.buffer)
-                self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
-            else:
-                # From TensorRT-LLM getMaxRequiredWorkspaceSize
-                self.max_required_workspace_size = [16 * 1024 * 1024, 8 * 1024 * 1024]
-
-                # sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
-                self.barrier_max_size = 8 * (36 + 2) * 8
-
-                self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
-                self.tmp_result_buffer_ptrs = self.create_shared_buffer(
-                    max_size, group=group
-                )
-                self.rank_data_base = torch.empty(
-                    8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-                )
-                self.barrier_in_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
-                self.barrier_out_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
+            # meta data buffers need to be "uncached" for signal on MI200
+            self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
+            self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
+            handle = ops.get_meta_buffer_ipc_handle(self.meta)
+            shard_data = (
+                bytes(handle),  # ipc handle to base ptr
+                0,  # offset of base ptr
+            )
+            handles, offsets = self._gather_ipc_meta(shard_data)
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
+            )
+            self.register_buffer(self.buffer)
+            self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
 
-            self._ptr = ops.init_custom_ar(
-                rank,
-                world_size,
-                self.rank_data_base,
-                self.buffer_ptrs,
-                self.tmp_result_buffer_ptrs,
-                self.barrier_in_ptrs,
-                self.barrier_out_ptrs,
-            )
         self.disabled = False
 
     @staticmethod
@@ -455,7 +423,7 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
         # little performance improvement over NCCL.
-        if
+        if not _is_hip:
             if self.world_size == 2 or self.full_nvlink:
                 return inp_size < self.max_size
             return False
@@ -471,18 +439,6 @@ class CustomAllreduce:
                 return inp_size < self.max_size
             return False
 
-        if self.world_size == 2:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[0]
-            )
-
-        if self.full_nvlink:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[1]
-            )
-
         return False
 
     # all reduce, assuming inp tensor is IPC registered with register_buffer,
@@ -515,15 +471,12 @@ class CustomAllreduce:
         """
         if out is None:
             out = torch.empty_like(inp)
-        if
-
-            ops.all_reduce(self._ptr, inp, out, 0, 0)
-        else:
-            ops.all_reduce(
-                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
-            )
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
         else:
-            ops.all_reduce(
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
         return out
 
     def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
@@ -554,14 +507,9 @@ class CustomAllreduce:
     def close(self):
         if not self.disabled and self._ptr:
             ops.dispose(self._ptr)
-            if
+            if _is_cuda:
                 self.free_shared_buffer(self.meta_ptrs)
                 self.free_shared_buffer(self.buffer_ptrs)
-            elif _is_cuda:
-                self.free_shared_buffer(self.buffer_ptrs)
-                self.free_shared_buffer(self.tmp_result_buffer_ptrs)
-                self.free_shared_buffer(self.barrier_in_ptrs)
-                self.free_shared_buffer(self.barrier_out_ptrs)
             self._ptr = 0
 
     def __del__(self):
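The `torch.cuda.device_count()` change above only affects how a logical device index maps back to a physical GPU id when `CUDA_VISIBLE_DEVICES` is unset. Below is a GPU-free sketch of that mapping; the helper name and values are illustrative.

```python
import os
from typing import List


def physical_device_id(device_index: int, local_device_count: int) -> int:
    # Mirrors the logic in CustomAllreduce.__init__: prefer CUDA_VISIBLE_DEVICES,
    # otherwise fall back to the local device count (torch.cuda.device_count()).
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if cuda_visible_devices:
        device_ids: List[int] = list(map(int, cuda_visible_devices.split(",")))
    else:
        device_ids = list(range(local_device_count))
    return device_ids[device_index]


os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
print(physical_device_id(1, local_device_count=8))  # -> 3
del os.environ["CUDA_VISIBLE_DEVICES"]
print(physical_device_id(1, local_device_count=8))  # -> 1
```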
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
CHANGED
@@ -11,11 +11,11 @@ import tempfile
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 
 from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-from sglang.srt.utils import cuda_device_count_stateless
 
 logger = logging.getLogger(__name__)
 
@@ -218,7 +218,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev =
+    num_dev = torch.cuda.device_count()
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
|
sglang/srt/entrypoints/engine.py
CHANGED
@@ -151,10 +151,6 @@ class Engine:
|
|
151
151
|
The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
|
152
152
|
Please refer to `GenerateReqInput` for the documentation.
|
153
153
|
"""
|
154
|
-
modalities_list = []
|
155
|
-
if image_data is not None:
|
156
|
-
modalities_list.append("image")
|
157
|
-
|
158
154
|
obj = GenerateReqInput(
|
159
155
|
text=prompt,
|
160
156
|
input_ids=input_ids,
|
@@ -165,7 +161,6 @@ class Engine:
|
|
165
161
|
top_logprobs_num=top_logprobs_num,
|
166
162
|
token_ids_logprob=token_ids_logprob,
|
167
163
|
lora_path=lora_path,
|
168
|
-
modalities=modalities_list,
|
169
164
|
custom_logit_processor=custom_logit_processor,
|
170
165
|
return_hidden_states=return_hidden_states,
|
171
166
|
stream=stream,
|