sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff shows the changes between two package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/glm4_moe.py
CHANGED
@@ -23,6 +23,7 @@ from torch import nn
 from transformers import PretrainedConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     parallel_state,
@@ -49,12 +50,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    DeepEPMoE,
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
@@ -83,7 +81,6 @@ from sglang.srt.two_batch_overlap import (
 )
 from sglang.srt.utils import (
     BumpAllocator,
-    DeepEPMode,
     LazyValue,
     add_prefix,
     bind_or_assign,
@@ -163,7 +160,7 @@ class Glm4MoeMLP(nn.Module):
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x,
+        x, _ = self.down_proj(x, skip_all_reduce=can_fuse_mlp_allreduce)
         return x
 
 
@@ -344,7 +341,7 @@ class Glm4MoeGate(nn.Module):
             torch.empty((config.n_routed_experts, config.hidden_size))
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty((config.n_routed_experts))
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
         )
         if _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(weight_names=["weight"])
@@ -388,6 +385,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
     ):
         nn.Module.__init__(self)
         self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_size = get_moe_expert_parallel_world_size()
         self.routed_scaling_factor = config.routed_scaling_factor
         self.n_shared_experts = config.n_shared_experts
         self.num_fused_shared_experts = (
@@ -443,15 +441,14 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
            routed_scaling_factor=self.routed_scaling_factor,
            prefix=add_prefix("experts", prefix),
            **(
-                dict(deepep_mode=
-                if global_server_args_dict["
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                else {}
            ),
            # Additional args for FusedMoE
            **(
                dict(
                    enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                )
                if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                else {}
@@ -482,11 +479,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                quant_config=quant_config,
                reduce_results=False,
                prefix=add_prefix("shared_experts", prefix),
-                **(
-                    dict(tp_rank=0, tp_size=1)
-                    if global_server_args_dict["enable_deepep_moe"]
-                    else {}
-                ),
+                **(dict(tp_rank=0, tp_size=1) if self.ep_size > 1 else {}),
            )
            is_packed_weight = hasattr(
                self.shared_experts.gate_up_proj.quant_method, "quant_config"
@@ -502,9 +495,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
 
        self.top_k = config.num_experts_per_tok
 
-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
            # TODO: we will support tp < ep in the future
-            self.ep_size =
+            self.ep_size = get_moe_expert_parallel_world_size()
            self.num_experts = (
                config.n_routed_experts
                + global_server_args_dict["ep_num_redundant_experts"]
@@ -526,12 +519,97 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                num_local_experts=config.n_routed_experts // self.tp_size,
                hidden_size=config.hidden_size,
                params_dtype=config.torch_dtype,
-                deepep_mode=
+                deepep_mode=global_server_args_dict["deepep_mode"],
                async_finish=True,
                return_recv_hook=True,
            )
 
-        self._enable_deepep_moe = global_server_args_dict["
+        self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states)
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            kwargs = {"hidden_states": hidden_states}
+            if self.topk is not None:
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+            else:
+                kwargs["router_logits"] = router_logits
+            final_hidden_states = self.experts(**kwargs)
+            if not _is_cuda:
+                final_hidden_states *= self.routed_scaling_factor
+        current_stream.wait_stream(self.alt_stream)
+
+        if self.ep_size > 1:
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            final_hidden_states += shared_output
+        else:
+            final_hidden_states += shared_output
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if hasattr(self, "shared_experts") and use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ):
+            return self.forward_cpu(hidden_states, can_fuse_mlp_allreduce)
+
+        shared_output = self._forward_shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        kwargs = {"hidden_states": hidden_states}
+        if self.topk is not None:
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+        else:
+            kwargs["router_logits"] = router_logits
+        final_hidden_states = self.experts(**kwargs)
+        if not _is_cuda and not _use_aiter:
+            # fused in biased_grouped_topk so we can skip here
+            final_hidden_states *= self.routed_scaling_factor
+        if self.ep_size > 1:
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            if shared_output is not None:
+                final_hidden_states += shared_output
+        else:
+            if shared_output is not None:
+                final_hidden_states += shared_output
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
 
 
 class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
@@ -617,6 +695,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
            layer_scatter_modes=self.layer_scatter_modes,
            input_layernorm=self.input_layernorm,
            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
        )
 
    def forward(
@@ -721,7 +800,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
        )
 
    def determine_num_fused_shared_experts(
-        self, architecture: str = "
+        self, architecture: str = "Glm4MoeForCausalLM"
    ):
        self.num_fused_shared_experts = 0
        if global_server_args_dict["disable_shared_experts_fusion"]:
@@ -733,15 +812,11 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
            not _is_cuda
            or torch.cuda.get_device_capability("cuda") < (8, 0)
            or self.config.architectures[0] != architecture
-            or self.config.n_routed_experts != 128
            or self.config.n_shared_experts != 1
        ):
            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
-        elif (
-
-            or global_server_args_dict["enable_ep_moe"]
-        ):
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization when in deepep_moe or ep_moe mode."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
 
        if disable_reason is not None:
            global_server_args_dict["disable_shared_experts_fusion"] = True