sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +4 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  16. sglang/srt/function_call/ebnf_composer.py +10 -3
  17. sglang/srt/function_call/function_call_parser.py +2 -0
  18. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  19. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  20. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  21. sglang/srt/layers/attention/vision.py +56 -8
  22. sglang/srt/layers/layernorm.py +26 -1
  23. sglang/srt/layers/logits_processor.py +14 -3
  24. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  27. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  28. sglang/srt/layers/moe/topk.py +84 -22
  29. sglang/srt/layers/multimodal.py +11 -8
  30. sglang/srt/layers/quantization/fp8.py +25 -247
  31. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  32. sglang/srt/layers/quantization/modelopt_quant.py +25 -10
  33. sglang/srt/layers/quantization/unquant.py +24 -76
  34. sglang/srt/layers/quantization/w4afp8.py +68 -17
  35. sglang/srt/lora/lora_registry.py +93 -29
  36. sglang/srt/managers/cache_controller.py +9 -7
  37. sglang/srt/managers/mm_utils.py +154 -35
  38. sglang/srt/managers/multimodal_processor.py +3 -14
  39. sglang/srt/managers/schedule_batch.py +14 -8
  40. sglang/srt/managers/scheduler.py +35 -1
  41. sglang/srt/managers/tokenizer_manager.py +37 -6
  42. sglang/srt/managers/tp_worker.py +3 -0
  43. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  44. sglang/srt/model_executor/model_runner.py +68 -14
  45. sglang/srt/models/deepseek_v2.py +62 -28
  46. sglang/srt/models/glm4_moe.py +1035 -0
  47. sglang/srt/models/glm4_moe_nextn.py +167 -0
  48. sglang/srt/models/interns1.py +328 -0
  49. sglang/srt/models/internvl.py +143 -47
  50. sglang/srt/models/llava.py +9 -5
  51. sglang/srt/models/minicpmo.py +4 -1
  52. sglang/srt/models/qwen2_moe.py +2 -2
  53. sglang/srt/models/qwen3_moe.py +5 -2
  54. sglang/srt/multimodal/processors/base_processor.py +20 -6
  55. sglang/srt/multimodal/processors/clip.py +2 -2
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  57. sglang/srt/multimodal/processors/gemma3.py +2 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  59. sglang/srt/multimodal/processors/internvl.py +21 -8
  60. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  61. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  62. sglang/srt/multimodal/processors/llava.py +4 -4
  63. sglang/srt/multimodal/processors/minicpm.py +2 -3
  64. sglang/srt/multimodal/processors/mlama.py +2 -2
  65. sglang/srt/multimodal/processors/mllama4.py +18 -111
  66. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  67. sglang/srt/multimodal/processors/pixtral.py +2 -2
  68. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  69. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  70. sglang/srt/multimodal/processors/vila.py +3 -1
  71. sglang/srt/reasoning_parser.py +2 -1
  72. sglang/srt/server_args.py +57 -6
  73. sglang/srt/utils.py +96 -1
  74. sglang/srt/weight_sync/utils.py +119 -0
  75. sglang/test/runners.py +4 -0
  76. sglang/test/test_utils.py +65 -5
  77. sglang/utils.py +19 -0
  78. sglang/version.py +1 -1
  79. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
  80. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
  81. sglang/srt/debug_utils.py +0 -74
  82. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
@@ -1308,9 +1308,58 @@ class ModelRunner:
         else:
             self.attn_backend = self._get_attention_backend()

-    # TODO unify with 6338
     def _get_attention_backend(self):
-        if self.server_args.attention_backend == "flashinfer":
+        """Init attention kernel backend."""
+        self.decode_attention_backend_str = (
+            self.server_args.decode_attention_backend
+            if self.server_args.decode_attention_backend
+            else self.server_args.attention_backend
+        )
+        self.prefill_attention_backend_str = (
+            self.server_args.prefill_attention_backend
+            if self.server_args.prefill_attention_backend
+            else self.server_args.attention_backend
+        )
+        if self.decode_attention_backend_str != self.prefill_attention_backend_str:
+            assert (
+                self.server_args.speculative_algorithm is None
+            ), "Currently HybridAttentionBackend does not support speculative decoding."
+            from sglang.srt.layers.attention.hybrid_attn_backend import (
+                HybridAttnBackend,
+            )
+
+            attn_backend = HybridAttnBackend(
+                decode_backend=self._get_attention_backend_from_str(
+                    self.decode_attention_backend_str
+                ),
+                prefill_backend=self._get_attention_backend_from_str(
+                    self.prefill_attention_backend_str
+                ),
+            )
+            logger.info(
+                f"Using hybrid attention backend for decode and prefill: "
+                f"decode_backend={self.decode_attention_backend_str}, "
+                f"prefill_backend={self.prefill_attention_backend_str}."
+            )
+            logger.warning(
+                f"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
+                f"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
+            )
+        else:
+            attn_backend = self._get_attention_backend_from_str(
+                self.server_args.attention_backend
+            )
+
+        global_server_args_dict.update(
+            {
+                "decode_attention_backend": self.decode_attention_backend_str,
+                "prefill_attention_backend": self.prefill_attention_backend_str,
+            }
+        )
+        return attn_backend
+
+    def _get_attention_backend_from_str(self, backend_str: str):
+        if backend_str == "flashinfer":
             if not self.use_mla_backend:
                 from sglang.srt.layers.attention.flashinfer_backend import (
                     FlashInferAttnBackend,
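Read in isolation, the new `_get_attention_backend` reduces to a small resolution step: each phase falls back to `--attention-backend` when its dedicated option is unset, and the `HybridAttnBackend` wrapper is only built when the two resolved names differ. A minimal standalone sketch of that logic, with an illustrative helper name that is not part of the package:

# Sketch of the backend-resolution logic above; `resolve_backends` is a
# hypothetical helper, not sglang API.
from typing import Optional, Tuple


def resolve_backends(
    attention_backend: str,
    decode_attention_backend: Optional[str] = None,
    prefill_attention_backend: Optional[str] = None,
) -> Tuple[str, str]:
    # Each phase falls back to the global --attention-backend value.
    decode = decode_attention_backend or attention_backend
    prefill = prefill_attention_backend or attention_backend
    return decode, prefill


decode, prefill = resolve_backends("fa3", decode_attention_backend="flashinfer")
use_hybrid = decode != prefill  # only then does ModelRunner build HybridAttnBackend
print(decode, prefill, use_hybrid)  # -> flashinfer fa3 True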
@@ -1318,7 +1367,11 @@ class ModelRunner:

                 # Init streams
                 if self.server_args.speculative_algorithm == "EAGLE":
-                    self.plan_stream_for_flashinfer = torch.cuda.Stream()
+                    if (
+                        not hasattr(self, "plan_stream_for_flashinfer")
+                        or not self.plan_stream_for_flashinfer
+                    ):
+                        self.plan_stream_for_flashinfer = torch.cuda.Stream()
                 return FlashInferAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.flashinfer_mla_backend import (
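The added guard makes the EAGLE plan stream creation idempotent: backend construction is now routed through a helper that may be invoked more than once on the same ModelRunner, so the CUDA stream is only created if it does not already exist. A minimal sketch of the same create-once pattern, using a toy class rather than the real ModelRunner:

# Equivalent lazy-init pattern (toy stand-in; requires a CUDA device to run).
import torch


class _StreamOwner:
    def ensure_plan_stream(self) -> "torch.cuda.Stream":
        # Same effect as the hasattr/falsy check in the diff: create the
        # stream only on the first call, reuse it afterwards.
        if not getattr(self, "plan_stream_for_flashinfer", None):
            self.plan_stream_for_flashinfer = torch.cuda.Stream()
        return self.plan_stream_for_flashinfer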
@@ -1326,15 +1379,15 @@ class ModelRunner:
                 )

                 return FlashInferMLAAttnBackend(self)
-        elif self.server_args.attention_backend == "aiter":
+        elif backend_str == "aiter":
             from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend

             return AiterAttnBackend(self)
-        elif self.server_args.attention_backend == "ascend":
+        elif backend_str == "ascend":
             from sglang.srt.layers.attention.ascend_backend import AscendAttnBackend

             return AscendAttnBackend(self)
-        elif self.server_args.attention_backend == "triton":
+        elif backend_str == "triton":
             assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
@@ -1349,17 +1402,17 @@ class ModelRunner:
             from sglang.srt.layers.attention.triton_backend import TritonAttnBackend

             return TritonAttnBackend(self)
-        elif self.server_args.attention_backend == "torch_native":
+        elif backend_str == "torch_native":
             from sglang.srt.layers.attention.torch_native_backend import (
                 TorchNativeAttnBackend,
             )

             return TorchNativeAttnBackend(self)
-        elif self.server_args.attention_backend == "flashmla":
+        elif backend_str == "flashmla":
             from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend

             return FlashMLABackend(self)
-        elif self.server_args.attention_backend == "fa3":
+        elif backend_str == "fa3":
             assert (
                 torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
             ) or torch.cuda.get_device_capability()[0] == 9, (
@@ -1371,7 +1424,7 @@ class ModelRunner:
            )

             return FlashAttentionBackend(self)
-        elif self.server_args.attention_backend == "cutlass_mla":
+        elif backend_str == "cutlass_mla":
             from sglang.srt.layers.attention.cutlass_mla_backend import (
                 CutlassMLABackend,
             )
@@ -1385,9 +1438,7 @@ class ModelRunner:
             logger.info(f"Intel AMX attention backend is enabled.")
             return IntelAMXAttnBackend(self)
         else:
-            raise ValueError(
-                f"Invalid attention backend: {self.server_args.attention_backend}"
-            )
+            raise ValueError(f"Invalid attention backend: {backend_str}")

     def init_double_sparsity_channel_config(self, selected_channel):
         selected_channel = "." + selected_channel + "_proj"
@@ -1475,7 +1526,10 @@ class ModelRunner:
         if self.support_pp:
             kwargs["pp_proxy_tensors"] = pp_proxy_tensors
         return self.model.forward(
-            forward_batch.input_ids, forward_batch.positions, forward_batch, **kwargs
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
         )

     def forward_extend(
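The split above is driven by two new server arguments that the code reads (`decode_attention_backend`, `prefill_attention_backend`). A hypothetical configuration sketch follows; the field names come from the diff, while the constructor-style usage and the model path are assumptions, not documented API:

# Hypothetical usage sketch: mixing backends per phase.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    attention_backend="fa3",                # used where no per-phase override is set
    decode_attention_backend="flashinfer",  # decode-only override -> hybrid mode
)
# Because the resolved decode and prefill backends differ, ModelRunner builds a
# HybridAttnBackend and emits the experimental-feature warning shown in the diff.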
sglang/srt/models/deepseek_v2.py
@@ -56,7 +56,11 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.ep_moe.layer import (
+    DeepEPMoE,
+    get_moe_impl_class,
+    use_flashinfer_trtllm_moe,
+)
 from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization import deep_gemm_wrapper
@@ -302,15 +306,19 @@ class DeepseekV2MoE(nn.Module):
             config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
         )

-        self.topk = TopK(
-            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
-            renormalize=config.norm_topk_prob,
-            use_grouped_topk=True,
-            num_expert_group=config.n_group,
-            num_fused_shared_experts=self.num_fused_shared_experts,
-            topk_group=config.topk_group,
-            correction_bias=self.gate.e_score_correction_bias,
-            routed_scaling_factor=self.routed_scaling_factor,
+        self.topk = (
+            TopK(
+                top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+                renormalize=config.norm_topk_prob,
+                use_grouped_topk=True,
+                num_expert_group=config.n_group,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+                topk_group=config.topk_group,
+                correction_bias=self.gate.e_score_correction_bias,
+                routed_scaling_factor=self.routed_scaling_factor,
+            )
+            if not use_flashinfer_trtllm_moe
+            else None
         )

         self.experts = get_moe_impl_class()(
@@ -332,10 +340,22 @@ class DeepseekV2MoE(nn.Module):
             # Additional args for FusedMoE
             **(
                 dict(
-                    enable_flashinfer_moe=True,
+                    enable_flashinfer_cutlass_moe=True,
                     enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
-                if global_server_args_dict["enable_flashinfer_moe"]
+                if global_server_args_dict["enable_flashinfer_cutlass_moe"]
+                else {}
+            ),
+            **(
+                dict(
+                    renormalize=config.norm_topk_prob,
+                    use_grouped_topk=True,
+                    num_expert_group=config.n_group,
+                    num_fused_shared_experts=self.num_fused_shared_experts,
+                    topk_group=config.topk_group,
+                    correction_bias=self.gate.e_score_correction_bias,
+                )
+                if use_flashinfer_trtllm_moe
                 else {}
             ),
         )
@@ -455,10 +475,12 @@ class DeepseekV2MoE(nn.Module):
         with torch.cuda.stream(self.alt_stream):
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
-            topk_output = self.topk(hidden_states, router_logits)
-            final_hidden_states = self.experts(
-                hidden_states=hidden_states, topk_output=topk_output
-            )
+            kwargs = {"hidden_states": hidden_states}
+            if self.topk is not None:
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+            else:
+                kwargs["router_logits"] = router_logits
+            final_hidden_states = self.experts(**kwargs)
             if not _is_cuda:
                 final_hidden_states *= self.routed_scaling_factor
         current_stream.wait_stream(self.alt_stream)
@@ -478,10 +500,12 @@ class DeepseekV2MoE(nn.Module):
         shared_output = self._forward_shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states)
-        topk_output = self.topk(hidden_states, router_logits)
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states, topk_output=topk_output
-        )
+        kwargs = {"hidden_states": hidden_states}
+        if self.topk is not None:
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+        else:
+            kwargs["router_logits"] = router_logits
+        final_hidden_states = self.experts(**kwargs)
         if not _is_cuda and not _use_aiter:
             # fused in biased_grouped_topk so we can skip here
             final_hidden_states *= self.routed_scaling_factor
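The pattern repeated in the two forward paths above is easier to read in isolation: when `use_flashinfer_trtllm_moe` is active, `self.topk` is `None` and the experts receive the raw `router_logits` (top-k selection happens inside the fused kernel); otherwise the standalone `TopK` module runs first and its result is passed as `topk_output`. A condensed sketch with an illustrative helper name:

# Condensed sketch of the routing dispatch above; `run_experts` is illustrative,
# the real attributes are DeepseekV2MoE.topk and DeepseekV2MoE.experts.
def run_experts(experts, topk, hidden_states, router_logits):
    kwargs = {"hidden_states": hidden_states}
    if topk is not None:
        # Standalone grouped top-k routing, then dispatch to the experts.
        kwargs["topk_output"] = topk(hidden_states, router_logits)
    else:
        # Fused path (flashinfer TRT-LLM MoE): the kernel performs the routing,
        # so it takes the raw router logits directly.
        kwargs["router_logits"] = router_logits
    return experts(**kwargs)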
@@ -901,7 +925,10 @@ class DeepseekV2AttentionMLA(nn.Module):
         self.disable_chunked_prefix_cache = global_server_args_dict[
             "disable_chunked_prefix_cache"
         ]
-        self.attention_backend = global_server_args_dict["attention_backend"]
+
+        self.current_attention_backend = (
+            None  # Attention backend used by current forward batch
+        )
         self.rocm_fused_decode_mla = get_bool_env_var(
             "SGLANG_ROCM_FUSED_DECODE_MLA", "false"
         )
@@ -985,9 +1012,16 @@ class DeepseekV2AttentionMLA(nn.Module):
             else:
                 return AttnForwardMethod.MLA

-        if self.attention_backend == "ascend":
+        # Determine attention backend used by current forward batch
+        if forward_batch.forward_mode.is_decode_or_idle():
+            attention_backend = global_server_args_dict["decode_attention_backend"]
+        else:
+            attention_backend = global_server_args_dict["prefill_attention_backend"]
+        self.current_attention_backend = attention_backend
+
+        if attention_backend == "ascend":
             return AttnForwardMethod.MLA
-        elif self.attention_backend == "flashinfer":
+        elif attention_backend == "flashinfer":
             # Flashinfer MLA: Do not absorb when enabling ragged prefill
             if (
                 not self.flashinfer_mla_disable_ragged
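Because decode and prefill may now run different kernels, the MLA dispatch above looks up the backend for the current batch from `global_server_args_dict` instead of caching a single `attention_backend` string at construction time. A restatement of that lookup as a hypothetical helper, for reference only:

# Hypothetical restatement of the per-batch backend selection above.
def current_backend(forward_batch, server_args_dict: dict) -> str:
    if forward_batch.forward_mode.is_decode_or_idle():
        return server_args_dict["decode_attention_backend"]
    return server_args_dict["prefill_attention_backend"]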
@@ -999,7 +1033,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MHA
             else:
                 return _dispatch_mla_subtype()
-        elif self.attention_backend == "fa3":
+        elif attention_backend == "fa3":
             # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences.
             if forward_batch.extend_prefix_lens_cpu is not None:
                 sum_extend_prefix_lens = sum(forward_batch.extend_prefix_lens_cpu)
@@ -1016,7 +1050,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MHA_CHUNKED_KV
             else:
                 return _dispatch_mla_subtype()
-        elif self.attention_backend == "aiter":
+        elif attention_backend == "aiter":
             if (
                 forward_batch.forward_mode.is_extend()
                 and not forward_batch.forward_mode.is_target_verify()
@@ -1264,9 +1298,9 @@ class DeepseekV2AttentionMLA(nn.Module):
         self, q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator
     ):
         if (
-            self.attention_backend == "fa3"
-            or self.attention_backend == "flashinfer"
-            or self.attention_backend == "cutlass_mla"
+            self.current_attention_backend == "fa3"
+            or self.current_attention_backend == "flashinfer"
+            or self.current_attention_backend == "cutlass_mla"
         ):
             attn_output = self.attn_mqa(
                 q_nope_out, k_nope, k_nope, forward_batch, q_rope=q_pe, k_rope=k_pe