sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +11 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +4 -3
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +71 -0
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/vision.py +13 -5
- sglang/srt/layers/communicator.py +21 -4
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +2 -7
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +77 -73
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +3 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +55 -30
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +28 -7
- sglang/srt/managers/scheduler.py +26 -12
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +24 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +7 -6
- sglang/srt/model_executor/forward_batch_info.py +35 -14
- sglang/srt/model_executor/model_runner.py +19 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +72 -33
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +24 -12
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +142 -7
- sglang/srt/two_batch_overlap.py +157 -5
- sglang/srt/utils.py +38 -2
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/glm4_moe.py
CHANGED
```diff
@@ -50,11 +50,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
@@ -162,7 +160,7 @@ class Glm4MoeMLP(nn.Module):
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x,
+        x, _ = self.down_proj(x, skip_all_reduce=can_fuse_mlp_allreduce)
         return x
 
 
@@ -343,7 +341,7 @@ class Glm4MoeGate(nn.Module):
             torch.empty((config.n_routed_experts, config.hidden_size))
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty((config.n_routed_experts))
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
         )
         if _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(weight_names=["weight"])
@@ -529,7 +527,10 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()
 
     def forward_normal_dual_stream(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
 
         current_stream = torch.cuda.current_stream()
@@ -550,21 +551,32 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             current_stream.wait_stream(self.alt_stream)
 
         if self.ep_size > 1:
-            if
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
             final_hidden_states += shared_output
         else:
             final_hidden_states += shared_output
-            if
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
         return final_hidden_states
 
     def forward_normal(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
         if hasattr(self, "shared_experts") and use_intel_amx_backend(
             self.shared_experts.gate_up_proj
@@ -683,6 +695,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
         )
 
     def forward(
@@ -787,7 +800,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
         )
 
     def determine_num_fused_shared_experts(
-        self, architecture: str = "
+        self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
         if global_server_args_dict["disable_shared_experts_fusion"]:
@@ -799,7 +812,6 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             not _is_cuda
             or torch.cuda.get_device_capability("cuda") < (8, 0)
             or self.config.architectures[0] != architecture
-            or self.config.n_routed_experts != 128
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
```
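The recurring change in the `Glm4MoeSparseMoeBlock` hunks is a wider guard around the tensor-parallel all-reduce: it now runs only when `tp_size > 1` and neither `can_fuse_mlp_allreduce` nor `use_reduce_scatter` is set, presumably because the reduction then happens later instead (fused downstream, or as a reduce-scatter enabled by the new `allow_reduce_scatter=True` flag on the layer communicator). Below is a minimal, self-contained sketch of that gating pattern; `finish_moe_output` is a hypothetical helper (not part of sglang), and plain `torch.distributed` stands in for sglang's `tensor_model_parallel_all_reduce`.

```python
# Sketch only: illustrates "skip the all-reduce when a later step will reduce
# for us", as in the diff above. finish_moe_output is a hypothetical helper;
# dist.all_reduce stands in for sglang's tensor_model_parallel_all_reduce.
import torch
import torch.distributed as dist


def finish_moe_output(
    final_hidden_states: torch.Tensor,
    shared_output: torch.Tensor,
    tp_size: int,
    can_fuse_mlp_allreduce: bool = False,
    use_reduce_scatter: bool = False,
) -> torch.Tensor:
    final_hidden_states = final_hidden_states + shared_output
    # Reduce across tensor-parallel ranks only if no downstream step
    # (fused all-reduce or reduce-scatter) will perform the reduction.
    if (
        tp_size > 1
        and not can_fuse_mlp_allreduce
        and not use_reduce_scatter
        and dist.is_initialized()
    ):
        dist.all_reduce(final_hidden_states)
    return final_hidden_states


if __name__ == "__main__":
    # Single-process usage: with tp_size == 1 the collective is skipped.
    out = finish_moe_output(torch.randn(4, 8), torch.randn(4, 8), tp_size=1)
    print(out.shape)
```

Skipping the per-layer all-reduce when a reduce-scatter or fused all-reduce follows avoids reducing the same tensor twice, which is why the two flags are threaded through both `forward_normal` and `forward_normal_dual_stream` in this release.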
|