sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (128)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/_custom_ops.py +29 -1
  3. sglang/srt/configs/internvl.py +3 -0
  4. sglang/srt/configs/model_config.py +5 -1
  5. sglang/srt/constrained/base_grammar_backend.py +10 -2
  6. sglang/srt/constrained/xgrammar_backend.py +7 -5
  7. sglang/srt/conversation.py +17 -2
  8. sglang/srt/debug_utils/__init__.py +0 -0
  9. sglang/srt/debug_utils/dump_comparator.py +131 -0
  10. sglang/srt/debug_utils/dumper.py +108 -0
  11. sglang/srt/debug_utils/text_comparator.py +172 -0
  12. sglang/srt/disaggregation/common/conn.py +34 -6
  13. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  14. sglang/srt/disaggregation/mini_lb.py +3 -2
  15. sglang/srt/disaggregation/mooncake/conn.py +65 -20
  16. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  17. sglang/srt/disaggregation/nixl/conn.py +17 -13
  18. sglang/srt/disaggregation/prefill.py +13 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  21. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  22. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  23. sglang/srt/distributed/parallel_state.py +70 -15
  24. sglang/srt/entrypoints/engine.py +5 -9
  25. sglang/srt/entrypoints/http_server.py +20 -32
  26. sglang/srt/entrypoints/openai/protocol.py +3 -3
  27. sglang/srt/entrypoints/openai/serving_chat.py +148 -72
  28. sglang/srt/function_call/base_format_detector.py +74 -12
  29. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  30. sglang/srt/function_call/ebnf_composer.py +105 -66
  31. sglang/srt/function_call/function_call_parser.py +6 -4
  32. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  33. sglang/srt/function_call/kimik2_detector.py +41 -16
  34. sglang/srt/function_call/llama32_detector.py +6 -3
  35. sglang/srt/function_call/mistral_detector.py +11 -3
  36. sglang/srt/function_call/pythonic_detector.py +16 -14
  37. sglang/srt/function_call/qwen25_detector.py +12 -3
  38. sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
  39. sglang/srt/layers/activation.py +11 -3
  40. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  41. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  42. sglang/srt/layers/attention/vision.py +56 -8
  43. sglang/srt/layers/communicator.py +12 -12
  44. sglang/srt/layers/dp_attention.py +72 -24
  45. sglang/srt/layers/layernorm.py +26 -1
  46. sglang/srt/layers/logits_processor.py +46 -25
  47. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  48. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
  51. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  52. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  53. sglang/srt/layers/moe/topk.py +88 -34
  54. sglang/srt/layers/multimodal.py +11 -8
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
  56. sglang/srt/layers/quantization/fp8.py +25 -247
  57. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  58. sglang/srt/layers/quantization/modelopt_quant.py +33 -14
  59. sglang/srt/layers/quantization/unquant.py +24 -76
  60. sglang/srt/layers/quantization/utils.py +0 -9
  61. sglang/srt/layers/quantization/w4afp8.py +68 -17
  62. sglang/srt/layers/radix_attention.py +5 -3
  63. sglang/srt/lora/lora_manager.py +133 -169
  64. sglang/srt/lora/lora_registry.py +188 -0
  65. sglang/srt/lora/mem_pool.py +2 -2
  66. sglang/srt/managers/cache_controller.py +62 -13
  67. sglang/srt/managers/io_struct.py +19 -1
  68. sglang/srt/managers/mm_utils.py +154 -35
  69. sglang/srt/managers/multimodal_processor.py +3 -14
  70. sglang/srt/managers/schedule_batch.py +27 -11
  71. sglang/srt/managers/scheduler.py +48 -26
  72. sglang/srt/managers/tokenizer_manager.py +62 -28
  73. sglang/srt/managers/tp_worker.py +5 -4
  74. sglang/srt/mem_cache/allocator.py +67 -7
  75. sglang/srt/mem_cache/hicache_storage.py +17 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +35 -18
  77. sglang/srt/mem_cache/memory_pool_host.py +3 -0
  78. sglang/srt/model_executor/cuda_graph_runner.py +61 -25
  79. sglang/srt/model_executor/forward_batch_info.py +201 -29
  80. sglang/srt/model_executor/model_runner.py +109 -37
  81. sglang/srt/models/deepseek_v2.py +63 -30
  82. sglang/srt/models/glm4_moe.py +1035 -0
  83. sglang/srt/models/glm4_moe_nextn.py +167 -0
  84. sglang/srt/models/interns1.py +328 -0
  85. sglang/srt/models/internvl.py +143 -47
  86. sglang/srt/models/llava.py +9 -5
  87. sglang/srt/models/minicpmo.py +4 -1
  88. sglang/srt/models/mllama4.py +10 -3
  89. sglang/srt/models/qwen2_moe.py +2 -6
  90. sglang/srt/models/qwen3_moe.py +6 -8
  91. sglang/srt/multimodal/processors/base_processor.py +20 -6
  92. sglang/srt/multimodal/processors/clip.py +2 -2
  93. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  94. sglang/srt/multimodal/processors/gemma3.py +2 -2
  95. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  96. sglang/srt/multimodal/processors/internvl.py +21 -8
  97. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  98. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  99. sglang/srt/multimodal/processors/llava.py +4 -4
  100. sglang/srt/multimodal/processors/minicpm.py +2 -3
  101. sglang/srt/multimodal/processors/mlama.py +2 -2
  102. sglang/srt/multimodal/processors/mllama4.py +18 -111
  103. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  104. sglang/srt/multimodal/processors/pixtral.py +2 -2
  105. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  106. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  107. sglang/srt/multimodal/processors/vila.py +3 -1
  108. sglang/srt/reasoning_parser.py +48 -5
  109. sglang/srt/sampling/sampling_batch_info.py +6 -5
  110. sglang/srt/server_args.py +132 -60
  111. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  112. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
  113. sglang/srt/speculative/eagle_utils.py +51 -23
  114. sglang/srt/speculative/eagle_worker.py +59 -44
  115. sglang/srt/two_batch_overlap.py +9 -5
  116. sglang/srt/utils.py +113 -69
  117. sglang/srt/weight_sync/utils.py +119 -0
  118. sglang/test/runners.py +4 -0
  119. sglang/test/test_activation.py +50 -1
  120. sglang/test/test_utils.py +65 -5
  121. sglang/utils.py +19 -0
  122. sglang/version.py +1 -1
  123. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
  124. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
  125. sglang/srt/debug_utils.py +0 -74
  126. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  127. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  128. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,9 @@ from sglang.srt.distributed import (
27
27
  tensor_model_parallel_all_gather,
28
28
  )
29
29
  from sglang.srt.layers.dp_attention import (
30
+ DPPaddingMode,
30
31
  attn_tp_all_gather,
32
+ attn_tp_all_gather_into_tensor,
31
33
  dp_gather_replicate,
32
34
  dp_scatter,
33
35
  get_attention_dp_rank,
@@ -111,7 +113,8 @@ class LogitsMetadata:
111
113
  # Number of tokens to sample per DP rank
112
114
  global_num_tokens_for_logprob_cpu: Optional[torch.Tensor] = None
113
115
  global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
114
-
116
+ # The gather mode for DP attention
117
+ dp_padding_mode: Optional[DPPaddingMode] = None
115
118
  # for padding
116
119
  padded_static_len: int = -1
117
120
 
@@ -163,12 +166,10 @@ class LogitsMetadata:
163
166
  forward_batch_gathered_buffer=forward_batch.gathered_buffer,
164
167
  global_num_tokens_for_logprob_cpu=forward_batch.global_num_tokens_for_logprob_cpu,
165
168
  global_num_tokens_for_logprob_gpu=forward_batch.global_num_tokens_for_logprob_gpu,
169
+ dp_padding_mode=DPPaddingMode.SUM_LEN,
166
170
  )
167
171
 
168
- def compute_dp_attention_metadata(self, hidden_states: torch.Tensor):
169
- if self.global_num_tokens_for_logprob_cpu is None:
170
- # we are capturing cuda graph
171
- return
172
+ def compute_dp_attention_metadata(self):
172
173
 
173
174
  cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
174
175
  dp_rank = get_attention_dp_rank()
@@ -179,18 +180,22 @@ class LogitsMetadata:
179
180
  else:
180
181
  dp_local_start_pos = cumtokens[dp_rank - 1]
181
182
  dp_local_num_tokens = self.global_num_tokens_for_logprob_gpu[dp_rank]
182
- gathered_buffer = torch.zeros(
183
- (
184
- sum(self.global_num_tokens_for_logprob_cpu),
185
- hidden_states.shape[1],
186
- ),
187
- dtype=hidden_states.dtype,
188
- device=hidden_states.device,
189
- )
190
183
 
191
184
  self.dp_local_start_pos = dp_local_start_pos
192
185
  self.dp_local_num_tokens = dp_local_num_tokens
193
- self.gathered_buffer = gathered_buffer
186
+
187
+ if self.global_num_tokens_for_logprob_cpu is not None:
188
+ # create a smaller buffer to reduce peak memory usage
189
+ self.gathered_buffer = torch.empty(
190
+ (
191
+ sum(self.global_num_tokens_for_logprob_cpu),
192
+ self.gathered_buffer.shape[1],
193
+ ),
194
+ dtype=self.gathered_buffer.dtype,
195
+ device=self.gathered_buffer.device,
196
+ )
197
+ else:
198
+ self.gathered_buffer = torch.empty_like(self.gathered_buffer)
194
199
 
195
200
 
196
201
  class LogitsProcessor(nn.Module):
@@ -434,9 +439,9 @@ class LogitsProcessor(nn.Module):
434
439
  guarantee the given hidden_states follow this constraint.
435
440
  """
436
441
  if self.do_tensor_parallel_all_gather_dp_attn:
437
- logits_metadata.compute_dp_attention_metadata(hidden_states)
442
+ logits_metadata.compute_dp_attention_metadata()
438
443
  hidden_states, local_hidden_states = (
439
- torch.empty_like(logits_metadata.gathered_buffer),
444
+ logits_metadata.gathered_buffer,
440
445
  hidden_states,
441
446
  )
442
447
  dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
@@ -463,15 +468,31 @@ class LogitsProcessor(nn.Module):
463
468
 
464
469
  if self.do_tensor_parallel_all_gather:
465
470
  if self.use_attn_tp_group:
466
- global_logits = torch.empty(
467
- (self.config.vocab_size, logits.shape[0]),
468
- device=logits.device,
469
- dtype=logits.dtype,
470
- )
471
- global_logits = global_logits.T
472
- attn_tp_all_gather(
473
- list(global_logits.tensor_split(self.attn_tp_size, dim=-1)), logits
474
- )
471
+ if self.config.vocab_size % self.attn_tp_size == 0:
472
+ global_logits = torch.empty(
473
+ (
474
+ self.attn_tp_size,
475
+ logits.shape[0],
476
+ self.config.vocab_size // self.attn_tp_size,
477
+ ),
478
+ device=logits.device,
479
+ dtype=logits.dtype,
480
+ )
481
+ attn_tp_all_gather_into_tensor(global_logits, logits)
482
+ global_logits = global_logits.permute(1, 0, 2).reshape(
483
+ logits.shape[0], self.config.vocab_size
484
+ )
485
+ else:
486
+ global_logits = torch.empty(
487
+ (self.config.vocab_size, logits.shape[0]),
488
+ device=logits.device,
489
+ dtype=logits.dtype,
490
+ )
491
+ global_logits = global_logits.T
492
+ attn_tp_all_gather(
493
+ list(global_logits.tensor_split(self.attn_tp_size, dim=-1)),
494
+ logits,
495
+ )
475
496
  logits = global_logits
476
497
  else:
477
498
  logits = tensor_model_parallel_all_gather(logits)