sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +5 -1
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +17 -2
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +65 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +5 -9
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +148 -72
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +105 -66
- sglang/srt/function_call/function_call_parser.py +6 -4
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +46 -25
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +88 -34
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +33 -14
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +188 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +62 -13
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +27 -11
- sglang/srt/managers/scheduler.py +48 -26
- sglang/srt/managers/tokenizer_manager.py +62 -28
- sglang/srt/managers/tp_worker.py +5 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +35 -18
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +109 -37
- sglang/srt/models/deepseek_v2.py +63 -30
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +2 -6
- sglang/srt/models/qwen3_moe.py +6 -8
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +48 -5
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +132 -60
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +113 -69
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_activation.py +50 -1
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/layers/logits_processor.py

@@ -27,7 +27,9 @@ from sglang.srt.distributed import (
     tensor_model_parallel_all_gather,
 )
 from sglang.srt.layers.dp_attention import (
+    DPPaddingMode,
     attn_tp_all_gather,
+    attn_tp_all_gather_into_tensor,
     dp_gather_replicate,
     dp_scatter,
     get_attention_dp_rank,
@@ -111,7 +113,8 @@ class LogitsMetadata:
     # Number of tokens to sample per DP rank
     global_num_tokens_for_logprob_cpu: Optional[torch.Tensor] = None
     global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
-
+    # The gather mode for DP attention
+    dp_padding_mode: Optional[DPPaddingMode] = None
     # for padding
     padded_static_len: int = -1
 
@@ -163,12 +166,10 @@ class LogitsMetadata:
             forward_batch_gathered_buffer=forward_batch.gathered_buffer,
             global_num_tokens_for_logprob_cpu=forward_batch.global_num_tokens_for_logprob_cpu,
             global_num_tokens_for_logprob_gpu=forward_batch.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DPPaddingMode.SUM_LEN,
         )
 
-    def compute_dp_attention_metadata(self, hidden_states: torch.Tensor):
-        if self.global_num_tokens_for_logprob_cpu is None:
-            # we are capturing cuda graph
-            return
+    def compute_dp_attention_metadata(self):
 
         cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
         dp_rank = get_attention_dp_rank()
@@ -179,18 +180,22 @@ class LogitsMetadata:
         else:
             dp_local_start_pos = cumtokens[dp_rank - 1]
         dp_local_num_tokens = self.global_num_tokens_for_logprob_gpu[dp_rank]
-        gathered_buffer = torch.zeros(
-            (
-                sum(self.global_num_tokens_for_logprob_cpu),
-                hidden_states.shape[1],
-            ),
-            dtype=hidden_states.dtype,
-            device=hidden_states.device,
-        )
 
         self.dp_local_start_pos = dp_local_start_pos
         self.dp_local_num_tokens = dp_local_num_tokens
-        self.gathered_buffer = gathered_buffer
+
+        if self.global_num_tokens_for_logprob_cpu is not None:
+            # create a smaller buffer to reduce peak memory usage
+            self.gathered_buffer = torch.empty(
+                (
+                    sum(self.global_num_tokens_for_logprob_cpu),
+                    self.gathered_buffer.shape[1],
+                ),
+                dtype=self.gathered_buffer.dtype,
+                device=self.gathered_buffer.device,
+            )
+        else:
+            self.gathered_buffer = torch.empty_like(self.gathered_buffer)
 
 
 class LogitsProcessor(nn.Module):
@@ -434,9 +439,9 @@ class LogitsProcessor(nn.Module):
         guarantee the given hidden_states follow this constraint.
         """
         if self.do_tensor_parallel_all_gather_dp_attn:
-            logits_metadata.compute_dp_attention_metadata(hidden_states)
+            logits_metadata.compute_dp_attention_metadata()
             hidden_states, local_hidden_states = (
-
+                logits_metadata.gathered_buffer,
                 hidden_states,
             )
             dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
@@ -463,15 +468,31 @@ class LogitsProcessor(nn.Module):
 
         if self.do_tensor_parallel_all_gather:
             if self.use_attn_tp_group:
-                global_logits = torch.empty(
-                    (self.config.vocab_size, logits.shape[0]),
-                    device=logits.device,
-                    dtype=logits.dtype,
-                )
-                global_logits = global_logits.T
-                attn_tp_all_gather(
-                    list(global_logits.tensor_split(self.attn_tp_size, dim=-1)), logits
-                )
+                if self.config.vocab_size % self.attn_tp_size == 0:
+                    global_logits = torch.empty(
+                        (
+                            self.attn_tp_size,
+                            logits.shape[0],
+                            self.config.vocab_size // self.attn_tp_size,
+                        ),
+                        device=logits.device,
+                        dtype=logits.dtype,
+                    )
+                    attn_tp_all_gather_into_tensor(global_logits, logits)
+                    global_logits = global_logits.permute(1, 0, 2).reshape(
+                        logits.shape[0], self.config.vocab_size
+                    )
+                else:
+                    global_logits = torch.empty(
+                        (self.config.vocab_size, logits.shape[0]),
+                        device=logits.device,
+                        dtype=logits.dtype,
+                    )
+                    global_logits = global_logits.T
+                    attn_tp_all_gather(
+                        list(global_logits.tensor_split(self.attn_tp_size, dim=-1)),
+                        logits,
+                    )
                 logits = global_logits
             else:
                 logits = tensor_model_parallel_all_gather(logits)