sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/models/gemma3n_mm.py
CHANGED
@@ -492,5 +492,43 @@ class Gemma3nForConditionalGeneration(PreTrainedModel):
             loaded_params.add(name)
         return loaded_params

+    lora_pattern = re.compile(
+        r"^language_model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self.lora_pattern.match(module_name))
+
+    def get_hidden_dim(self, module_name):
+        # return input_dim, output_dim
+        if module_name == "qkv_proj":
+            return (
+                self.config.hidden_size,
+                self.config.head_dim
+                * (
+                    self.config.num_attention_heads
+                    + self.config.num_key_value_heads * 2
+                ),
+            )
+        elif module_name == "o_proj":
+            return (
+                self.config.head_dim * self.config.num_attention_heads,
+                self.config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.hidden_size, self.config.intermediate_size[0] * 2
+        elif module_name == "down_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.intermediate_size[0], self.config.hidden_size
+        else:
+            raise NotImplementedError()
+

 EntryClass = Gemma3nForConditionalGeneration
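The hunk above adds LoRA hooks to Gemma3nForConditionalGeneration: a regex that restricts LoRA to the language-model projection modules, and get_hidden_dim, which reports the (input_dim, output_dim) pair a LoRA adapter must match for each projection. A minimal standalone sketch of that arithmetic follows; the config numbers are illustrative placeholders, not values from the diff.

# Illustrative sketch only: mirrors the new Gemma3n LoRA hooks outside the model class.
import re
from types import SimpleNamespace

lora_pattern = re.compile(
    r"^language_model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
)

# Placeholder config values (not Gemma3n's real numbers).
cfg = SimpleNamespace(hidden_size=2048, head_dim=256, num_attention_heads=8, num_key_value_heads=2)

def qkv_hidden_dim(c):
    # Same formula as get_hidden_dim("qkv_proj"): the output packs Q plus K and V heads.
    return c.hidden_size, c.head_dim * (c.num_attention_heads + c.num_key_value_heads * 2)

print(bool(lora_pattern.match("language_model.layers.3.self_attn.qkv_proj")))  # True
print(bool(lora_pattern.match("vision_tower.blocks.0.attn.qkv")))              # False
print(qkv_hidden_dim(cfg))  # (2048, 3072) with these placeholder values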
sglang/srt/models/glm4.py
CHANGED
@@ -218,6 +218,12 @@ class Glm4Model(nn.Module):

         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
     @torch.no_grad()
     def forward(
         self,
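The glm4.py hunk adds two small accessors to Glm4Model. The toy module below (not the real class) shows the same pattern, assuming a caller wants to embed token ids itself and query the parameter dtype.

# Illustrative toy module showing the accessor pattern added in the hunk above.
import torch
import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self, vocab_size: int = 128, hidden_size: int = 16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embed_tokens

    def dtype(self) -> torch.dtype:
        # dtype of the first parameter, as in the diff
        return next(self.parameters()).dtype

m = TinyModel()
embeds = m.get_input_embeddings()(torch.tensor([[1, 2, 3]]))
print(embeds.shape, m.dtype())  # torch.Size([1, 3, 16]) torch.float32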
sglang/srt/models/glm4_moe.py
CHANGED
@@ -50,11 +50,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
@@ -156,13 +154,13 @@ class Glm4MoeMLP(nn.Module):
         )
         self.act_fn = SiluAndMul()

-    def forward(self, x, forward_batch=None,
+    def forward(self, x, forward_batch=None, should_allreduce_fusion=False):
         if (self.tp_size == 1) and x.shape[0] == 0:
             return x

         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x,
+        x, _ = self.down_proj(x, skip_all_reduce=should_allreduce_fusion)
         return x


@@ -343,7 +341,7 @@ class Glm4MoeGate(nn.Module):
             torch.empty((config.n_routed_experts, config.hidden_size))
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty((config.n_routed_experts))
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
         )
         if _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(weight_names=["weight"])
@@ -529,7 +527,10 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()

     def forward_normal_dual_stream(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:

         current_stream = torch.cuda.current_stream()
@@ -550,26 +551,37 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         current_stream.wait_stream(self.alt_stream)

         if self.ep_size > 1:
-            if
+            if (
+                self.tp_size > 1
+                and not should_allreduce_fusion
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
            final_hidden_states += shared_output
         else:
            final_hidden_states += shared_output
-            if
+            if (
+                self.tp_size > 1
+                and not should_allreduce_fusion
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
         return final_hidden_states

     def forward_normal(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
         if hasattr(self, "shared_experts") and use_intel_amx_backend(
             self.shared_experts.gate_up_proj
         ):
-            return self.forward_cpu(hidden_states,
+            return self.forward_cpu(hidden_states, should_allreduce_fusion)

         shared_output = self._forward_shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
@@ -584,7 +596,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         # fused in biased_grouped_topk so we can skip here
         final_hidden_states *= self.routed_scaling_factor
         if self.ep_size > 1:
-            if self.tp_size > 1 and not
+            if self.tp_size > 1 and not should_allreduce_fusion:
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
@@ -593,7 +605,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         else:
             if shared_output is not None:
                 final_hidden_states += shared_output
-            if self.tp_size > 1 and not
+            if self.tp_size > 1 and not should_allreduce_fusion:
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
@@ -683,6 +695,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
         )

     def forward(
@@ -787,7 +800,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
         )

     def determine_num_fused_shared_experts(
-        self, architecture: str = "
+        self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
         if global_server_args_dict["disable_shared_experts_fusion"]:
@@ -799,7 +812,6 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             not _is_cuda
             or torch.cuda.get_device_capability("cuda") < (8, 0)
             or self.config.architectures[0] != architecture
-            or self.config.n_routed_experts != 128
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
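The recurring change in glm4_moe.py threads two flags, should_allreduce_fusion and use_reduce_scatter, through the MoE forward paths so that the per-layer tensor-parallel all-reduce can be skipped whenever a fused all-reduce or a reduce-scatter will combine the outputs instead. A minimal sketch of that gating condition is below; the helper name is ours, not from the diff.

# Illustrative helper capturing the gating condition used in the rewritten if-blocks above.
def needs_tp_all_reduce(
    tp_size: int,
    should_allreduce_fusion: bool = False,
    use_reduce_scatter: bool = False,
) -> bool:
    # All-reduce only when tensor parallelism is active and no fused/alternative combine applies.
    return tp_size > 1 and not should_allreduce_fusion and not use_reduce_scatter

assert needs_tp_all_reduce(tp_size=4)
assert not needs_tp_all_reduce(tp_size=4, should_allreduce_fusion=True)
assert not needs_tp_all_reduce(tp_size=4, use_reduce_scatter=True)
assert not needs_tp_all_reduce(tp_size=1)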