sglang 0.4.10__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sglang/bench_offline_throughput.py +20 -0
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/conversation.py +0 -112
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
  7. sglang/srt/disaggregation/launch_lb.py +5 -20
  8. sglang/srt/disaggregation/mooncake/conn.py +33 -15
  9. sglang/srt/disaggregation/prefill.py +1 -0
  10. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  11. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  12. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  13. sglang/srt/distributed/parallel_state.py +11 -0
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/http_server.py +35 -15
  16. sglang/srt/eplb/expert_distribution.py +4 -2
  17. sglang/srt/hf_transformers_utils.py +25 -10
  18. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  19. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  20. sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
  21. sglang/srt/layers/attention/utils.py +6 -1
  22. sglang/srt/layers/attention/vision.py +27 -10
  23. sglang/srt/layers/communicator.py +14 -4
  24. sglang/srt/layers/linear.py +7 -1
  25. sglang/srt/layers/logits_processor.py +9 -1
  26. sglang/srt/layers/moe/ep_moe/layer.py +29 -68
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +82 -25
  29. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
  30. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  31. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  32. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  33. sglang/srt/layers/moe/utils.py +43 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  35. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  36. sglang/srt/layers/quantization/fp8.py +57 -1
  37. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  38. sglang/srt/layers/quantization/w8a8_int8.py +4 -1
  39. sglang/srt/layers/vocab_parallel_embedding.py +7 -1
  40. sglang/srt/lora/lora_registry.py +7 -0
  41. sglang/srt/managers/cache_controller.py +43 -39
  42. sglang/srt/managers/data_parallel_controller.py +52 -2
  43. sglang/srt/managers/io_struct.py +6 -1
  44. sglang/srt/managers/schedule_batch.py +3 -2
  45. sglang/srt/managers/schedule_policy.py +3 -1
  46. sglang/srt/managers/scheduler.py +145 -6
  47. sglang/srt/managers/template_manager.py +25 -22
  48. sglang/srt/managers/tokenizer_manager.py +114 -62
  49. sglang/srt/managers/utils.py +45 -1
  50. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  51. sglang/srt/mem_cache/hicache_storage.py +13 -12
  52. sglang/srt/mem_cache/hiradix_cache.py +21 -4
  53. sglang/srt/mem_cache/memory_pool.py +15 -118
  54. sglang/srt/mem_cache/memory_pool_host.py +350 -33
  55. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  56. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
  57. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  58. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +163 -0
  59. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +238 -0
  60. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +216 -0
  61. sglang/srt/model_executor/cuda_graph_runner.py +42 -4
  62. sglang/srt/model_executor/forward_batch_info.py +13 -3
  63. sglang/srt/model_executor/model_runner.py +13 -1
  64. sglang/srt/model_loader/weight_utils.py +2 -0
  65. sglang/srt/models/deepseek_v2.py +28 -23
  66. sglang/srt/models/glm4_moe.py +85 -22
  67. sglang/srt/models/grok.py +3 -3
  68. sglang/srt/models/llama4.py +13 -2
  69. sglang/srt/models/mixtral.py +3 -3
  70. sglang/srt/models/mllama4.py +428 -19
  71. sglang/srt/models/qwen2_moe.py +1 -4
  72. sglang/srt/models/qwen3_moe.py +7 -8
  73. sglang/srt/models/step3_vl.py +1 -4
  74. sglang/srt/multimodal/processors/base_processor.py +4 -3
  75. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  76. sglang/srt/operations_strategy.py +1 -1
  77. sglang/srt/server_args.py +115 -21
  78. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  79. sglang/srt/two_batch_overlap.py +6 -4
  80. sglang/srt/utils.py +4 -24
  81. sglang/srt/weight_sync/utils.py +1 -1
  82. sglang/test/attention/test_trtllm_mla_backend.py +945 -0
  83. sglang/test/runners.py +2 -2
  84. sglang/test/test_utils.py +3 -3
  85. sglang/version.py +1 -1
  86. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
  87. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +92 -81
  88. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  89. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  90. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
  91. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
  92. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen3_moe.py CHANGED
@@ -24,6 +24,7 @@ import torch
 from torch import nn
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty
 
 Qwen3MoeConfig = None
 
@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:
 
-        if not global_server_args_dict["enable_deepep_moe"]:
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
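Note on the new call sites: the boolean `enable_deepep_moe` checks are replaced by `.is_deepep()` on the `moe_a2a_backend` value. Its concrete type lives in sglang/srt/layers/moe/utils.py (added in this release, not shown in this diff). The following is a hedged sketch inferred from the call sites; the class name `MoeA2ABackend`, the `from_server_arg` helper, and all members other than the "deepep" choice and `is_deepep()` are assumptions.

from enum import Enum
from typing import Optional


class MoeA2ABackend(Enum):
    # Hypothetical names; only the "deepep" value and the is_deepep() accessor
    # are confirmed by the call sites in this diff.
    NONE = "none"
    DEEPEP = "deepep"

    @classmethod
    def from_server_arg(cls, value: Optional[str]) -> "MoeA2ABackend":
        # --moe-a2a-backend defaults to None; "deepep" is the only other choice.
        return cls.DEEPEP if value == "deepep" else cls.NONE

    def is_deepep(self) -> bool:
        return self is MoeA2ABackend.DEEPEP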
sglang/srt/models/step3_vl.py CHANGED
@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -868,7 +868,6 @@ class Step3VLForConditionalGeneration(nn.Module):
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        # TODO:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", 0),
@@ -901,9 +900,7 @@ class Step3VLForConditionalGeneration(nn.Module):
 
         for name, loaded_weight in weights:
             if "vision_model" in name:
-                # 1.It’s not great, but let’s leave it like this for now
                 name = name.replace("self_attn", "self_attn.attn")
-                # 2.
                 name = name.replace("out_proj", "proj")
 
             # TODO: support vision model
@@ -12,7 +12,6 @@ import torch
12
12
  from PIL import Image
13
13
  from transformers import BaseImageProcessorFast
14
14
 
15
- from sglang.srt.managers.mm_utils import TransportProxyTensor
16
15
  from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
17
16
  from sglang.srt.utils import load_audio, load_image, load_video, logger
18
17
 
@@ -218,8 +217,10 @@ class BaseMultimodalProcessor(ABC):
             kwargs["audio"] = audios
 
         processor = self._processor
-        if hasattr(processor, "image_processor") and isinstance(
-            processor.image_processor, BaseImageProcessorFast
+        if (
+            hasattr(processor, "image_processor")
+            and isinstance(processor.image_processor, BaseImageProcessorFast)
+            and not self.server_args.disable_fast_image_processor
         ):
             kwargs["device"] = "cuda"
             result = processor.__call__(
sglang/srt/multimodal/processors/gemma3n.py CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 
-import re
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.multimodal_processor import (
@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<image_soft_token>",
             image_token_id=hf_config.image_token_id,
-            image_token_regex=re.compile(
-                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-            ),
             audio_token="<audio_soft_token>",
             audio_token_id=hf_config.audio_token_id,
-            audio_token_regex=re.compile(
-                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-            ),
         ).build(_processor)
 
     async def process_mm_data_async(
sglang/srt/operations_strategy.py CHANGED
@@ -4,7 +4,7 @@ from typing import List, Optional
 import torch
 
 from sglang.srt import operations
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.operations import Operation
 
sglang/srt/server_args.py CHANGED
@@ -24,6 +24,7 @@ import tempfile
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -148,6 +149,7 @@ class ServerArgs:
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
     lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -171,12 +173,11 @@
 
     # Expert parallelism
     ep_size: int = 1
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -197,7 +198,8 @@
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
+    hicache_io_backend: str = "kernel"
+    hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
 
     # Double Sparsity
@@ -215,7 +217,9 @@
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_profile_cuda_graph: bool = False
+    enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
+    enable_symm_mem: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -269,7 +273,27 @@
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
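For scripts that launch the server, a hedged migration sketch: the flag names and the mapping behavior come from this diff, while the model path, parallel sizes, and the `sglang.launch_server` entrypoint are placeholder assumptions.

model = "placeholder/model-path"

deprecated_argv = [
    "python", "-m", "sglang.launch_server",
    "--model-path", model, "--tp-size", "8",
    "--enable-ep-moe",        # now mapped to ep_size = tp_size with a deprecation warning
    "--enable-deepep-moe",    # now mapped to --moe-a2a-backend deepep with a deprecation warning
]

current_argv = [
    "python", "-m", "sglang.launch_server",
    "--model-path", model, "--tp-size", "8",
    "--ep-size", "8",
    "--moe-a2a-backend", "deepep",
]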
@@ -401,6 +425,22 @@
             )
             self.page_size = 128
 
+        if self.attention_backend == "trtllm_mla":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mla backend does not support speculative decoding yet."
+                )
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
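A hedged usage sketch for the new attention backend choice; only the flag names and the constraints enforced above come from this diff, everything else is placeholder.

trtllm_mla_argv = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "placeholder-mla-model",
    "--attention-backend", "trtllm_mla",
    "--page-size", "64",   # must be 32 or 64; other values are coerced to 64 with a warning
]
# Also enforced in __post_init__: Blackwell (SM100) GPUs only, and no
# --speculative-algorithm may be combined with trtllm_mla yet.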
@@ -436,13 +476,13 @@
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-
-        if self.enable_flashinfer_trtllm_moe:
-            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
-            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         # DeepEP MoE
-        if self.enable_deepep_moe:
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
@@ -466,7 +506,7 @@
             )
 
         if self.enable_eplb:
-            assert self.enable_ep_moe or self.enable_deepep_moe
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None
 
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -1131,6 +1171,7 @@
             choices=[
                 "round_robin",
                 "shortest_queue",
+                "minimum_tokens",
             ],
         )
 
@@ -1198,6 +1239,12 @@
             default=8,
             help="Maximum number of adapters for a running batch, include base-only request.",
         )
+        parser.add_argument(
+            "--max-loaded-loras",
+            type=int,
+            default=ServerArgs.max_loaded_loras,
+            help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
@@ -1219,6 +1266,7 @@
                 "torch_native",
                 "ascend",
                 "triton",
+                "trtllm_mla",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1333,30 +1381,27 @@
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1467,10 +1512,18 @@
             default=ServerArgs.hicache_io_backend,
             help="The IO backend for KV cache transfer between CPU and GPU",
         )
+        parser.add_argument(
+            "--hicache-mem-layout",
+            type=str,
+            choices=["layer_first", "page_first"],
+            default=ServerArgs.hicache_mem_layout,
+            help="The layout of host memory pool for hierarchical cache.",
+        )
+
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs"],
+            choices=["file", "mooncake", "hf3fs", "nixl"],
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
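Taken together with the new ServerArgs fields above, a hedged launch sketch for the hierarchical-cache options; the flag values come from the choices in this diff, while the model path and the `--enable-hierarchical-cache` toggle are assumptions not shown here.

hicache_argv = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "placeholder-model",
    "--enable-hierarchical-cache",            # assumed existing toggle, not part of this diff
    "--hicache-io-backend", "kernel",         # new default value
    "--hicache-mem-layout", "page_first",     # new flag; default is layer_first
    "--hicache-storage-backend", "nixl",      # nixl newly added alongside file/mooncake/hf3fs
]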
@@ -1545,11 +1598,21 @@
             action="store_true",
             help="Enable profiling of cuda graph capture.",
         )
+        parser.add_argument(
+            "--enable-cudagraph-gc",
+            action="store_true",
+            help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-symm-mem",
+            action="store_true",
+            help="Enable NCCL symmetric memory for fast collectives.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1805,6 +1868,18 @@
             help="Disable mmap while loading weight using safetensors.",
         )
 
+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1861,6 +1936,12 @@
         if "Llama4" in model_arch:
             assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
 
+        if "Gemma2ForCausalLM" in model_arch:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning("Disable hybrid SWA memory for Gemma2ForCausalLM.")
+            self.disable_hybrid_swa_memory = True
+
         # Check LoRA
         self.check_lora_server_args()
 
@@ -1935,6 +2016,19 @@
                 self.max_lora_rank and self.lora_target_modules
             ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
 
+        # Validate max_loaded_loras
+        if self.max_loaded_loras is not None:
+            assert self.max_loaded_loras >= self.max_loras_per_batch, (
+                "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
+                f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
+            )
+            assert (
+                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
+            ), (
+                "The number of LoRA paths should not exceed max_loaded_loras. "
+                f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
+            )
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
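The two assertions above impose max_loras_per_batch <= max_loaded_loras and len(lora_paths) <= max_loaded_loras. A small illustrative sketch with placeholder adapter names and counts:

lora_paths = ["adapter_a", "adapter_b", "adapter_c"]  # 3 preloaded adapters (placeholders)

accepted = dict(max_loras_per_batch=2, max_loaded_loras=4)         # 2 <= 4 and 3 <= 4
rejected_pool = dict(max_loras_per_batch=4, max_loaded_loras=2)    # fails: 4 > 2
rejected_paths = dict(max_loras_per_batch=2, max_loaded_loras=2)   # fails: 3 adapters > 2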
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py CHANGED
@@ -142,6 +142,22 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.global_num_tokens_for_logprob_gpu = None
         self.gathered_buffer = None
 
+        if hasattr(
+            self.model_runner.model_config.hf_config, "draft_vocab_size"
+        ):  # llama_eagle
+            vocab_size = self.model_runner.model_config.hf_config.draft_vocab_size
+        elif hasattr(
+            self.model_runner.model_config.hf_config, "hot_vocab_size"
+        ):  # llama_eagle3
+            vocab_size = self.model_runner.model_config.hf_config.hot_vocab_size
+        else:
+            vocab_size = self.model_runner.model_config.vocab_size
+
+        self.next_token_logits_buffer = torch.zeros(
+            (self.max_bs, vocab_size),
+            dtype=torch.float,
+        )
+
         # Capture
         try:
             with model_capture_mode():
@@ -189,6 +205,7 @@ class EAGLEDraftExtendCudaGraphRunner:
         out_cache_loc = self.out_cache_loc[:num_tokens]
         positions = self.positions[:num_tokens]
         hidden_states = self.hidden_states[:num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:bs]
 
         if self.require_mlp_tp_gather:
             self.global_num_tokens_gpu.copy_(
@@ -238,6 +255,7 @@ class EAGLEDraftExtendCudaGraphRunner:
             input_ids=input_ids,
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,
sglang/srt/two_batch_overlap.py CHANGED
@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
     CommunicateSummableTensorPairFn,
     ScatterMode,
 )
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
 from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
+from sglang.srt.utils import BumpAllocator, get_bool_env_var
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
 
 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
 
@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
                     and not local_batch.forward_mode.is_target_verify()
                 )
                 and enable_deepep_moe
-                and (resolved_deepep_mode == DeepEPMode.low_latency)
+                and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
             )
         else:
             self.local_tbo_split_seq_index = 0
@@ -563,6 +564,7 @@ class TboForwardBatchPreparer:
                 mm_inputs=None,
                 top_logprobs_nums=None,
                 token_ids_logprobs=None,
+                next_token_logits_buffer=None,
             )
         )
 
sglang/srt/utils.py CHANGED
@@ -44,7 +44,6 @@ import traceback
 import warnings
 from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
-from enum import Enum
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
@@ -93,6 +92,7 @@ logger = logging.getLogger(__name__)
 show_time_cost = False
 time_infos = {}
 
+
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
 
@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
     return [nested_list]
 
 
-class DeepEPMode(Enum):
-    normal = "normal"
-    low_latency = "low_latency"
-    auto = "auto"
-
-    def enable_normal(self):
-        return self in [DeepEPMode.normal, DeepEPMode.auto]
-
-    def enable_low_latency(self):
-        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
-
-    def resolve(self, is_extend_in_batch: bool):
-        if self != DeepEPMode.auto:
-            return self
-
-        if is_extend_in_batch:
-            return DeepEPMode.normal
-        else:
-            return DeepEPMode.low_latency
-
-
 def is_non_idle_and_non_empty(forward_mode, hidden_states):
     return (
         (forward_mode is not None)
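The enum removed here is re-homed rather than dropped: two_batch_overlap.py now imports DeepEPMode from sglang.srt.layers.moe.utils and references an upper-case member (DeepEPMode.LOW_LATENCY). A hedged sketch of what the relocated definition presumably looks like; member names other than LOW_LATENCY and the retained method set are assumptions inferred from the deleted implementation above.

from enum import Enum


class DeepEPMode(Enum):
    # Assumed to mirror the removed sglang.srt.utils.DeepEPMode with upper-case
    # member names; only LOW_LATENCY is confirmed by this diff.
    NORMAL = "normal"
    LOW_LATENCY = "low_latency"
    AUTO = "auto"

    def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
        if self != DeepEPMode.AUTO:
            return self
        return DeepEPMode.NORMAL if is_extend_in_batch else DeepEPMode.LOW_LATENCY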
@@ -2344,6 +2323,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs
 
@@ -2413,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
             return True
         elif not server_args.enable_dp_lm_head:
             return True
-        elif not server_args.enable_deepep_moe:
+        elif server_args.moe_a2a_backend is None:
             return True
         else:
             return (
@@ -2429,7 +2409,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
+    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
sglang/srt/weight_sync/utils.py CHANGED
@@ -45,7 +45,7 @@ async def update_weights(
             (
                 name,
                 MultiprocessingSerializer.serialize(
-                    _preprocess_tensor_for_update_weights(tensor)
+                    _preprocess_tensor_for_update_weights(tensor.detach())
                 ),
             )
             for name, tensor in params_batch
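The only change in this file is the added `.detach()`. A minimal standalone sketch of the motivation (not sglang-specific): detaching yields a view of the same storage that is cut off from the autograd graph, so serializing it does not try to carry gradient history across processes.

import torch

t = torch.ones(2, requires_grad=True)
view = t.detach()                      # same storage, no autograd graph attached
assert not view.requires_grad
assert view.data_ptr() == t.data_ptr()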