sglang 0.4.10__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +20 -0
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/conversation.py +0 -112
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/disaggregation/prefill.py +1 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +11 -0
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +35 -15
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/hf_transformers_utils.py +25 -10
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/attention/vision.py +27 -10
- sglang/srt/layers/communicator.py +14 -4
- sglang/srt/layers/linear.py +7 -1
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/ep_moe/layer.py +29 -68
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +82 -25
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/utils.py +43 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp8.py +57 -1
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- sglang/srt/lora/lora_registry.py +7 -0
- sglang/srt/managers/cache_controller.py +43 -39
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +3 -2
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +145 -6
- sglang/srt/managers/template_manager.py +25 -22
- sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -12
- sglang/srt/mem_cache/hiradix_cache.py +21 -4
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +350 -33
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/model_executor/cuda_graph_runner.py +42 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -3
- sglang/srt/model_executor/model_runner.py +13 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/deepseek_v2.py +28 -23
- sglang/srt/models/glm4_moe.py +85 -22
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2_moe.py +1 -4
- sglang/srt/models/qwen3_moe.py +7 -8
- sglang/srt/models/step3_vl.py +1 -4
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/server_args.py +115 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +6 -4
- sglang/srt/utils.py +4 -24
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +92 -81
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen3_moe.py
CHANGED
@@ -24,6 +24,7 @@ import torch
 from torch import nn
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty
 
 Qwen3MoeConfig = None
 
@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=
-                if global_server_args_dict["
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size =
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
    ) -> torch.Tensor:
 
-        if not global_server_args_dict["
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
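The substantive change here is that MoE all-to-all gating no longer reads a boolean flag: `global_server_args_dict["moe_a2a_backend"]` now holds an object with an `is_deepep()` method. A minimal sketch of such a wrapper, assuming an enum-based design (the real definition lands in the new `sglang/srt/layers/moe/utils.py` listed above; the name `MoeA2ABackend` and its member set are assumptions, not confirmed by this diff):

```python
# Illustrative sketch only: an enum-like wrapper whose is_deepep() replaces the
# boolean enable_deepep_moe lookups removed in this diff.
from enum import Enum
from typing import Optional


class MoeA2ABackend(Enum):
    NONE = "none"
    DEEPEP = "deepep"

    @classmethod
    def from_server_arg(cls, value: Optional[str]) -> "MoeA2ABackend":
        # ServerArgs.moe_a2a_backend is either None or "deepep".
        return cls.DEEPEP if value == "deepep" else cls.NONE

    def is_deepep(self) -> bool:
        return self is MoeA2ABackend.DEEPEP
```

Keeping the check behind a method means call sites like `forward()` above stay agnostic about how the backend was configured.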
sglang/srt/models/step3_vl.py
CHANGED
@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -868,7 +868,6 @@ class Step3VLForConditionalGeneration(nn.Module):
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        # TODO:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", 0),
@@ -901,9 +900,7 @@ class Step3VLForConditionalGeneration(nn.Module):
 
         for name, loaded_weight in weights:
             if "vision_model" in name:
-                # 1.It’s not great, but let’s leave it like this for now
                 name = name.replace("self_attn", "self_attn.attn")
-                # 2.
                 name = name.replace("out_proj", "proj")
 
             # TODO: support vision model
sglang/srt/multimodal/processors/base_processor.py
CHANGED
@@ -12,7 +12,6 @@ import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast
 
-from sglang.srt.managers.mm_utils import TransportProxyTensor
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import load_audio, load_image, load_video, logger
 
@@ -218,8 +217,10 @@ class BaseMultimodalProcessor(ABC):
             kwargs["audio"] = audios
 
         processor = self._processor
-        if
-            processor
+        if (
+            hasattr(processor, "image_processor")
+            and isinstance(processor.image_processor, BaseImageProcessorFast)
+            and not self.server_args.disable_fast_image_processor
         ):
             kwargs["device"] = "cuda"
             result = processor.__call__(
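The rewritten condition only routes preprocessing to CUDA when the processor actually carries a torch-backed fast image processor and the user has not opted out via the `disable_fast_image_processor` server argument. The same gate as a standalone sketch (the helper name is invented for illustration):

```python
from transformers import BaseImageProcessorFast


def should_preprocess_on_cuda(processor, server_args) -> bool:
    # Mirrors the gate in BaseMultimodalProcessor above: require a fast
    # (torch-backed) image processor and no explicit opt-out.
    return (
        hasattr(processor, "image_processor")
        and isinstance(processor.image_processor, BaseImageProcessorFast)
        and not server_args.disable_fast_image_processor
    )
```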
sglang/srt/multimodal/processors/gemma3n.py
CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 
-import re
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.multimodal_processor import (
@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<image_soft_token>",
             image_token_id=hf_config.image_token_id,
-            image_token_regex=re.compile(
-                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-            ),
             audio_token="<audio_soft_token>",
             audio_token_id=hf_config.audio_token_id,
-            audio_token_regex=re.compile(
-                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-            ),
         ).build(_processor)
 
     async def process_mm_data_async(
|
|
4
4
|
import torch
|
5
5
|
|
6
6
|
from sglang.srt import operations
|
7
|
-
from sglang.srt.layers.moe.
|
7
|
+
from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
|
8
8
|
from sglang.srt.model_executor.forward_batch_info import ForwardMode
|
9
9
|
from sglang.srt.operations import Operation
|
10
10
|
|
sglang/srt/server_args.py
CHANGED
@@ -24,6 +24,7 @@ import tempfile
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -148,6 +149,7 @@ class ServerArgs:
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
     lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -171,12 +173,11 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode:
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -197,7 +198,8 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
+    hicache_io_backend: str = "kernel"
+    hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
 
     # Double Sparsity
@@ -215,7 +217,9 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_profile_cuda_graph: bool = False
+    enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
+    enable_symm_mem: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -269,7 +273,27 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
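The shim above keeps old launch scripts working by rewriting the deprecated flags onto their replacements inside `__post_init__`. A hedged sketch of the observable effect (constructing `ServerArgs` directly like this is illustrative; `__post_init__` also performs other environment-dependent checks, and the model path is a placeholder):

```python
from sglang.srt.server_args import ServerArgs

# Deprecated flags still parse, but are rewritten with a warning:
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tp_size=8,
    enable_ep_moe=True,      # deprecated
    enable_deepep_moe=True,  # deprecated
)
assert args.ep_size == args.tp_size      # set by the enable_ep_moe shim
assert args.moe_a2a_backend == "deepep"  # set by the enable_deepep_moe shim
```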
@@ -401,6 +425,22 @@ class ServerArgs:
         )
         self.page_size = 128
 
+        if self.attention_backend == "trtllm_mla":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mla backend does not support speculative decoding yet."
+                )
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
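This validation block gives `trtllm_mla` three behaviors: it refuses to run on non-Blackwell hardware, coerces an unsupported page size to 64 with a warning instead of failing, and rejects speculative decoding outright. An illustrative check of the coercion, assuming SM100 hardware and an MLA-style model (the model path is a placeholder):

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="deepseek-ai/DeepSeek-R1",  # placeholder MLA model
    attention_backend="trtllm_mla",
    page_size=16,                          # not in {32, 64}
)
# __post_init__ logs a warning and coerces the page size:
assert args.page_size == 64
```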
@@ -436,13 +476,13 @@ class ServerArgs:
             self.quantization == "modelopt_fp4"
         ), "modelopt_fp4 quantization is required for Flashinfer MOE"
         os.environ["TRTLLM_ENABLE_PDL"] = "1"
-
-
-
-
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         # DeepEP MoE
-        if self.
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
@@ -466,7 +506,7 @@ class ServerArgs:
         )
 
         if self.enable_eplb:
-            assert self.
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None
 
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -1131,6 +1171,7 @@ class ServerArgs:
             choices=[
                 "round_robin",
                 "shortest_queue",
+                "minimum_tokens",
             ],
         )
 
@@ -1198,6 +1239,12 @@ class ServerArgs:
             default=8,
             help="Maximum number of adapters for a running batch, include base-only request.",
         )
+        parser.add_argument(
+            "--max-loaded-loras",
+            type=int,
+            default=ServerArgs.max_loaded_loras,
+            help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
@@ -1219,6 +1266,7 @@ class ServerArgs:
                 "torch_native",
                 "ascend",
                 "triton",
+                "trtllm_mla",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1333,30 +1381,27 @@ class ServerArgs:
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--
-
-
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1467,10 +1512,18 @@ class ServerArgs:
             default=ServerArgs.hicache_io_backend,
             help="The IO backend for KV cache transfer between CPU and GPU",
         )
+        parser.add_argument(
+            "--hicache-mem-layout",
+            type=str,
+            choices=["layer_first", "page_first"],
+            default=ServerArgs.hicache_mem_layout,
+            help="The layout of host memory pool for hierarchical cache.",
+        )
+
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs"],
+            choices=["file", "mooncake", "hf3fs", "nixl"],
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
@@ -1545,11 +1598,21 @@ class ServerArgs:
             action="store_true",
             help="Enable profiling of cuda graph capture.",
         )
+        parser.add_argument(
+            "--enable-cudagraph-gc",
+            action="store_true",
+            help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-symm-mem",
+            action="store_true",
+            help="Enable NCCL symmetric memory for fast collectives.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1805,6 +1868,18 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )
 
+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1861,6 +1936,12 @@ class ServerArgs:
         if "Llama4" in model_arch:
             assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
 
+        if "Gemma2ForCausalLM" in model_arch:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning("Disable hybrid SWA memory for Gemma2ForCausalLM.")
+            self.disable_hybrid_swa_memory = True
+
         # Check LoRA
         self.check_lora_server_args()
 
@@ -1935,6 +2016,19 @@ class ServerArgs:
             self.max_lora_rank and self.lora_target_modules
         ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
 
+        # Validate max_loaded_loras
+        if self.max_loaded_loras is not None:
+            assert self.max_loaded_loras >= self.max_loras_per_batch, (
+                "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
+                f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
+            )
+            assert (
+                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
+            ), (
+                "The number of LoRA paths should not exceed max_loaded_loras. "
+                f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
+            )
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
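Together these assertions enforce `max_loras_per_batch <= max_loaded_loras` and `len(lora_paths) <= max_loaded_loras`. An illustrative configuration that satisfies both (adapter names and model path are placeholders):

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",       # placeholder base model
    lora_paths=["adapter_a", "adapter_b", "adapter_c"],  # 3 adapters resident
    max_loras_per_batch=2,  # at most 2 adapters in one running batch
    max_loaded_loras=3,     # CPU-memory cap; must be >= max_loras_per_batch
)
# 2 <= 3 and len(lora_paths) == 3 <= 3, so both assertions above hold.
```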
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
CHANGED
@@ -142,6 +142,22 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.global_num_tokens_for_logprob_gpu = None
         self.gathered_buffer = None
 
+        if hasattr(
+            self.model_runner.model_config.hf_config, "draft_vocab_size"
+        ):  # llama_eagle
+            vocab_size = self.model_runner.model_config.hf_config.draft_vocab_size
+        elif hasattr(
+            self.model_runner.model_config.hf_config, "hot_vocab_size"
+        ):  # llama_eagle3
+            vocab_size = self.model_runner.model_config.hf_config.hot_vocab_size
+        else:
+            vocab_size = self.model_runner.model_config.vocab_size
+
+        self.next_token_logits_buffer = torch.zeros(
+            (self.max_bs, vocab_size),
+            dtype=torch.float,
+        )
+
         # Capture
         try:
             with model_capture_mode():
@@ -189,6 +205,7 @@ class EAGLEDraftExtendCudaGraphRunner:
         out_cache_loc = self.out_cache_loc[:num_tokens]
         positions = self.positions[:num_tokens]
         hidden_states = self.hidden_states[:num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:bs]
 
         if self.require_mlp_tp_gather:
             self.global_num_tokens_gpu.copy_(
@@ -238,6 +255,7 @@ class EAGLEDraftExtendCudaGraphRunner:
             input_ids=input_ids,
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,
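The buffer allocation walks a vocab-size fallback chain keyed off the draft model's HF config: `draft_vocab_size` (EAGLE), then `hot_vocab_size` (EAGLE3), then the full model vocabulary. A condensed equivalent of that chain, for illustration:

```python
def resolve_draft_vocab_size(model_config) -> int:
    # Same fallback order as the buffer allocation above.
    hf_cfg = model_config.hf_config
    if hasattr(hf_cfg, "draft_vocab_size"):  # llama_eagle
        return hf_cfg.draft_vocab_size
    if hasattr(hf_cfg, "hot_vocab_size"):    # llama_eagle3
        return hf_cfg.hot_vocab_size
    return model_config.vocab_size           # dense model fallback
```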
sglang/srt/two_batch_overlap.py
CHANGED
@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
     CommunicateSummableTensorPairFn,
     ScatterMode,
 )
-from sglang.srt.layers.moe.
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
 from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import BumpAllocator,
+from sglang.srt.utils import BumpAllocator, get_bool_env_var
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
 
 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
 
@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
                 and not local_batch.forward_mode.is_target_verify()
                 )
                 and enable_deepep_moe
-                and (resolved_deepep_mode == DeepEPMode.
+                and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
             )
         else:
             self.local_tbo_split_seq_index = 0
@@ -563,6 +564,7 @@ class TboForwardBatchPreparer:
                 mm_inputs=None,
                 top_logprobs_nums=None,
                 token_ids_logprobs=None,
+                next_token_logits_buffer=None,
             )
         )
 
sglang/srt/utils.py
CHANGED
@@ -44,7 +44,6 @@ import traceback
 import warnings
 from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
-from enum import Enum
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
@@ -93,6 +92,7 @@ logger = logging.getLogger(__name__)
 show_time_cost = False
 time_infos = {}
 
+
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
 
@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
     return [nested_list]
 
 
-class DeepEPMode(Enum):
-    normal = "normal"
-    low_latency = "low_latency"
-    auto = "auto"
-
-    def enable_normal(self):
-        return self in [DeepEPMode.normal, DeepEPMode.auto]
-
-    def enable_low_latency(self):
-        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
-
-    def resolve(self, is_extend_in_batch: bool):
-        if self != DeepEPMode.auto:
-            return self
-
-        if is_extend_in_batch:
-            return DeepEPMode.normal
-        else:
-            return DeepEPMode.low_latency
-
-
 def is_non_idle_and_non_empty(forward_mode, hidden_states):
     return (
         (forward_mode is not None)
@@ -2344,6 +2323,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs
 
@@ -2413,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
         return True
     elif not server_args.enable_dp_lm_head:
         return True
-    elif
+    elif server_args.moe_a2a_backend is None:
         return True
     else:
         return (
@@ -2429,7 +2409,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.
+    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
sglang/srt/weight_sync/utils.py
CHANGED