sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +16 -1
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mooncake/conn.py +16 -0
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +132 -79
- sglang/srt/function_call/ebnf_composer.py +10 -3
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/qwen3_coder_detector.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +14 -3
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +84 -22
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +25 -10
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/lora/lora_registry.py +93 -29
- sglang/srt/managers/cache_controller.py +9 -7
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +14 -8
- sglang/srt/managers/scheduler.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +37 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +68 -14
- sglang/srt/models/deepseek_v2.py +62 -28
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/qwen3_moe.py +5 -2
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +57 -6
- sglang/srt/utils.py +96 -1
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -1308,9 +1308,58 @@ class ModelRunner:
         else:
             self.attn_backend = self._get_attention_backend()
 
-    # TODO unify with 6338
     def _get_attention_backend(self):
-
+        """Init attention kernel backend."""
+        self.decode_attention_backend_str = (
+            self.server_args.decode_attention_backend
+            if self.server_args.decode_attention_backend
+            else self.server_args.attention_backend
+        )
+        self.prefill_attention_backend_str = (
+            self.server_args.prefill_attention_backend
+            if self.server_args.prefill_attention_backend
+            else self.server_args.attention_backend
+        )
+        if self.decode_attention_backend_str != self.prefill_attention_backend_str:
+            assert (
+                self.server_args.speculative_algorithm is None
+            ), "Currently HybridAttentionBackend does not support speculative decoding."
+            from sglang.srt.layers.attention.hybrid_attn_backend import (
+                HybridAttnBackend,
+            )
+
+            attn_backend = HybridAttnBackend(
+                decode_backend=self._get_attention_backend_from_str(
+                    self.decode_attention_backend_str
+                ),
+                prefill_backend=self._get_attention_backend_from_str(
+                    self.prefill_attention_backend_str
+                ),
+            )
+            logger.info(
+                f"Using hybrid attention backend for decode and prefill: "
+                f"decode_backend={self.decode_attention_backend_str}, "
+                f"prefill_backend={self.prefill_attention_backend_str}."
+            )
+            logger.warning(
+                f"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
+                f"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
+            )
+        else:
+            attn_backend = self._get_attention_backend_from_str(
+                self.server_args.attention_backend
+            )
+
+        global_server_args_dict.update(
+            {
+                "decode_attention_backend": self.decode_attention_backend_str,
+                "prefill_attention_backend": self.prefill_attention_backend_str,
+            }
+        )
+        return attn_backend
+
+    def _get_attention_backend_from_str(self, backend_str: str):
+        if backend_str == "flashinfer":
             if not self.use_mla_backend:
                 from sglang.srt.layers.attention.flashinfer_backend import (
                     FlashInferAttnBackend,
@@ -1318,7 +1367,11 @@ class ModelRunner:
 
                 # Init streams
                 if self.server_args.speculative_algorithm == "EAGLE":
-
+                    if (
+                        not hasattr(self, "plan_stream_for_flashinfer")
+                        or not self.plan_stream_for_flashinfer
+                    ):
+                        self.plan_stream_for_flashinfer = torch.cuda.Stream()
                 return FlashInferAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.flashinfer_mla_backend import (
@@ -1326,15 +1379,15 @@ class ModelRunner:
                 )
 
                 return FlashInferMLAAttnBackend(self)
-        elif
+        elif backend_str == "aiter":
             from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
 
             return AiterAttnBackend(self)
-        elif
+        elif backend_str == "ascend":
             from sglang.srt.layers.attention.ascend_backend import AscendAttnBackend
 
             return AscendAttnBackend(self)
-        elif
+        elif backend_str == "triton":
             assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
@@ -1349,17 +1402,17 @@ class ModelRunner:
             from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 
             return TritonAttnBackend(self)
-        elif
+        elif backend_str == "torch_native":
             from sglang.srt.layers.attention.torch_native_backend import (
                 TorchNativeAttnBackend,
             )
 
             return TorchNativeAttnBackend(self)
-        elif
+        elif backend_str == "flashmla":
            from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
 
            return FlashMLABackend(self)
-        elif
+        elif backend_str == "fa3":
             assert (
                 torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
             ) or torch.cuda.get_device_capability()[0] == 9, (
@@ -1371,7 +1424,7 @@ class ModelRunner:
             )
 
             return FlashAttentionBackend(self)
-        elif
+        elif backend_str == "cutlass_mla":
             from sglang.srt.layers.attention.cutlass_mla_backend import (
                 CutlassMLABackend,
             )
@@ -1385,9 +1438,7 @@ class ModelRunner:
             logger.info(f"Intel AMX attention backend is enabled.")
             return IntelAMXAttnBackend(self)
         else:
-            raise ValueError(
-                f"Invalid attention backend: {self.server_args.attention_backend}"
-            )
+            raise ValueError(f"Invalid attention backend: {backend_str}")
 
     def init_double_sparsity_channel_config(self, selected_channel):
         selected_channel = "." + selected_channel + "_proj"
@@ -1475,7 +1526,10 @@ class ModelRunner:
         if self.support_pp:
             kwargs["pp_proxy_tensors"] = pp_proxy_tensors
         return self.model.forward(
-            forward_batch.input_ids,
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
         )
 
     def forward_extend(
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -56,7 +56,11 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import
+from sglang.srt.layers.moe.ep_moe.layer import (
+    DeepEPMoE,
+    get_moe_impl_class,
+    use_flashinfer_trtllm_moe,
+)
 from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization import deep_gemm_wrapper
@@ -302,15 +306,19 @@ class DeepseekV2MoE(nn.Module):
             config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
         )
 
-        self.topk =
-
-
-
-
-
-
-
-
+        self.topk = (
+            TopK(
+                top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+                renormalize=config.norm_topk_prob,
+                use_grouped_topk=True,
+                num_expert_group=config.n_group,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+                topk_group=config.topk_group,
+                correction_bias=self.gate.e_score_correction_bias,
+                routed_scaling_factor=self.routed_scaling_factor,
+            )
+            if not use_flashinfer_trtllm_moe
+            else None
         )
 
         self.experts = get_moe_impl_class()(
@@ -332,10 +340,22 @@ class DeepseekV2MoE(nn.Module):
             # Additional args for FusedMoE
             **(
                 dict(
-
+                    enable_flashinfer_cutlass_moe=True,
                     enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
-                if global_server_args_dict["
+                if global_server_args_dict["enable_flashinfer_cutlass_moe"]
+                else {}
+            ),
+            **(
+                dict(
+                    renormalize=config.norm_topk_prob,
+                    use_grouped_topk=True,
+                    num_expert_group=config.n_group,
+                    num_fused_shared_experts=self.num_fused_shared_experts,
+                    topk_group=config.topk_group,
+                    correction_bias=self.gate.e_score_correction_bias,
+                )
+                if use_flashinfer_trtllm_moe
                 else {}
             ),
         )
@@ -455,10 +475,12 @@ class DeepseekV2MoE(nn.Module):
         with torch.cuda.stream(self.alt_stream):
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
-
-
-
-
+            kwargs = {"hidden_states": hidden_states}
+            if self.topk is not None:
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+            else:
+                kwargs["router_logits"] = router_logits
+            final_hidden_states = self.experts(**kwargs)
             if not _is_cuda:
                 final_hidden_states *= self.routed_scaling_factor
             current_stream.wait_stream(self.alt_stream)
@@ -478,10 +500,12 @@ class DeepseekV2MoE(nn.Module):
         shared_output = self._forward_shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states)
-
-
-
-
+        kwargs = {"hidden_states": hidden_states}
+        if self.topk is not None:
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+        else:
+            kwargs["router_logits"] = router_logits
+        final_hidden_states = self.experts(**kwargs)
        if not _is_cuda and not _use_aiter:
            # fused in biased_grouped_topk so we can skip here
            final_hidden_states *= self.routed_scaling_factor
@@ -901,7 +925,10 @@ class DeepseekV2AttentionMLA(nn.Module):
         self.disable_chunked_prefix_cache = global_server_args_dict[
             "disable_chunked_prefix_cache"
         ]
-
+
+        self.current_attention_backend = (
+            None  # Attention backend used by current forward batch
+        )
         self.rocm_fused_decode_mla = get_bool_env_var(
             "SGLANG_ROCM_FUSED_DECODE_MLA", "false"
         )
@@ -985,9 +1012,16 @@ class DeepseekV2AttentionMLA(nn.Module):
         else:
             return AttnForwardMethod.MLA
 
-
+        # Determine attention backend used by current forward batch
+        if forward_batch.forward_mode.is_decode_or_idle():
+            attention_backend = global_server_args_dict["decode_attention_backend"]
+        else:
+            attention_backend = global_server_args_dict["prefill_attention_backend"]
+        self.current_attention_backend = attention_backend
+
+        if attention_backend == "ascend":
             return AttnForwardMethod.MLA
-        elif
+        elif attention_backend == "flashinfer":
             # Flashinfer MLA: Do not absorb when enabling ragged prefill
             if (
                 not self.flashinfer_mla_disable_ragged
@@ -999,7 +1033,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MHA
             else:
                 return _dispatch_mla_subtype()
-        elif
+        elif attention_backend == "fa3":
             # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences.
             if forward_batch.extend_prefix_lens_cpu is not None:
                 sum_extend_prefix_lens = sum(forward_batch.extend_prefix_lens_cpu)
@@ -1016,7 +1050,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MHA_CHUNKED_KV
             else:
                 return _dispatch_mla_subtype()
-        elif
+        elif attention_backend == "aiter":
             if (
                 forward_batch.forward_mode.is_extend()
                 and not forward_batch.forward_mode.is_target_verify()
@@ -1264,9 +1298,9 @@ class DeepseekV2AttentionMLA(nn.Module):
         self, q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator
     ):
         if (
-            self.
-            or self.
-            or self.
+            self.current_attention_backend == "fa3"
+            or self.current_attention_backend == "flashinfer"
+            or self.current_attention_backend == "cutlass_mla"
         ):
             attn_output = self.attn_mqa(
                 q_nope_out, k_nope, k_nope, forward_batch, q_rope=q_pe, k_rope=k_pe
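
Note on the DeepseekV2MoE hunks above: the TopK module is now constructed only when use_flashinfer_trtllm_moe is false, and both forward paths build the expert call dynamically, passing a precomputed topk_output when a TopK module exists and the raw router_logits otherwise. Below is a minimal sketch of that dispatch written as a free function for clarity; the parameters mirror self.experts and self.topk in the diff, but this is not the shipped code.

# Hedged sketch of the expert-call dispatch in the DeepseekV2MoE forward paths.
# experts / topk stand in for self.experts / self.topk in the diff above.
def call_experts(experts, topk, hidden_states, router_logits):
    kwargs = {"hidden_states": hidden_states}
    if topk is not None:
        # Routing is computed up front and handed to the expert layer.
        kwargs["topk_output"] = topk(hidden_states, router_logits)
    else:
        # No TopK module (use_flashinfer_trtllm_moe): the expert
        # implementation consumes the raw router logits directly.
        kwargs["router_logits"] = router_logits
    return experts(**kwargs)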