sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (84)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +6 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +23 -3
  11. sglang/srt/entrypoints/openai/protocol.py +3 -1
  12. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  13. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  14. sglang/srt/eplb/expert_distribution.py +5 -0
  15. sglang/srt/eplb/expert_location.py +17 -6
  16. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  17. sglang/srt/eplb/expert_location_updater.py +2 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/step3_detector.py +436 -0
  20. sglang/srt/hf_transformers_utils.py +2 -0
  21. sglang/srt/jinja_template_utils.py +4 -1
  22. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  23. sglang/srt/layers/moe/ep_moe/layer.py +98 -603
  24. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  29. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  30. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  31. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  32. sglang/srt/layers/moe/topk.py +6 -2
  33. sglang/srt/layers/quantization/fp8.py +0 -18
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  35. sglang/srt/layers/quantization/unquant.py +0 -8
  36. sglang/srt/layers/quantization/w4afp8.py +1 -0
  37. sglang/srt/managers/cache_controller.py +143 -45
  38. sglang/srt/managers/data_parallel_controller.py +6 -0
  39. sglang/srt/managers/io_struct.py +12 -2
  40. sglang/srt/managers/scheduler.py +116 -669
  41. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  42. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  43. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  44. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  45. sglang/srt/managers/template_manager.py +62 -19
  46. sglang/srt/managers/tokenizer_manager.py +166 -83
  47. sglang/srt/managers/tp_worker.py +9 -0
  48. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  49. sglang/srt/mem_cache/hicache_storage.py +45 -11
  50. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  51. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  52. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  53. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  54. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  55. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  56. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  57. sglang/srt/model_executor/model_runner.py +20 -13
  58. sglang/srt/models/arcee.py +532 -0
  59. sglang/srt/models/deepseek_v2.py +15 -56
  60. sglang/srt/models/glm4_moe.py +3 -1
  61. sglang/srt/models/granitemoe.py +3 -0
  62. sglang/srt/models/grok.py +3 -0
  63. sglang/srt/models/hunyuan.py +1 -0
  64. sglang/srt/models/llama4.py +3 -0
  65. sglang/srt/models/mixtral.py +3 -0
  66. sglang/srt/models/olmoe.py +3 -0
  67. sglang/srt/models/phimoe.py +1 -0
  68. sglang/srt/models/qwen3_moe.py +12 -69
  69. sglang/srt/models/step3_vl.py +994 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/poll_based_barrier.py +31 -0
  73. sglang/srt/reasoning_parser.py +2 -1
  74. sglang/srt/server_args.py +18 -13
  75. sglang/srt/speculative/eagle_worker.py +2 -0
  76. sglang/srt/two_batch_overlap.py +8 -3
  77. sglang/test/test_utils.py +53 -0
  78. sglang/utils.py +0 -11
  79. sglang/version.py +1 -1
  80. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
  81. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
  82. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0

sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -7,11 +7,16 @@ from typing import List, Optional, Tuple
 import torch
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_rank,
+    get_moe_expert_parallel_world_size,
+    get_moe_tensor_parallel_rank,
+    get_moe_tensor_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.layers.moe.topk import StandardTopKOutput
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
@@ -62,8 +67,9 @@ class FusedMoE(torch.nn.Module):
         num_experts: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         top_k: Optional[int] = None,
-        layer_id: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
@@ -77,21 +83,19 @@ class FusedMoE(torch.nn.Module):
         routed_scaling_factor: Optional[float] = None,
         enable_flashinfer_cutlass_moe: Optional[bool] = False,
         enable_ep_moe: Optional[bool] = False,
-        skip_quant: Optional[bool] = False,
     ):
         super().__init__()
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.layer_id = layer_id
         self.top_k = top_k
         self.hidden_size = hidden_size
-        self.tp_size = (
-            tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
-        )
-        self.tp_rank = get_tensor_model_parallel_rank()
         self.num_experts = num_experts
-        self.expert_map = None
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.expert_map_cpu = None
+        self.expert_map_gpu = None
 
         if enable_flashinfer_cutlass_moe and quant_config is None:
             logger.warning("Disable flashinfer MoE when quantization config is None.")
@@ -99,28 +103,27 @@ class FusedMoE(torch.nn.Module):
             enable_ep_moe = False
 
         self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.moe_ep_rank = get_moe_expert_parallel_rank()
+        self.moe_tp_size = get_moe_tensor_parallel_world_size()
+        self.moe_tp_rank = get_moe_tensor_parallel_rank()
+        assert num_experts % self.moe_ep_size == 0
+        self.num_local_experts = num_experts // self.moe_ep_size
         if enable_ep_moe:
-            self.ep_size = self.tp_size
-            self.ep_rank = self.tp_rank
-            self.tp_size = 1
-            self.tp_rank = 0
+            # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
-            self.expert_map = torch.full((self.num_experts,), -1, dtype=torch.int32)
+            self.expert_map_cpu = torch.full((self.num_experts,), -1, dtype=torch.int32)
             # Create a expert map for the local experts
-            assert num_experts % self.ep_size == 0
-            self.num_local_experts = num_experts // self.ep_size
-            self.expert_map[
-                self.ep_rank
-                * self.num_local_experts : (self.ep_rank + 1)
+            self.expert_map_cpu[
+                self.moe_ep_rank
+                * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-        else:
-            self.ep_size = 1
-            self.ep_rank = 0
-            self.num_local_experts = num_experts
+            self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+
         self.routed_scaling_factor = routed_scaling_factor
-        assert intermediate_size % self.tp_size == 0
-        self.intermediate_size_per_partition = intermediate_size // self.tp_size
+        assert intermediate_size % self.moe_tp_size == 0
+        self.intermediate_size_per_partition = intermediate_size // self.moe_tp_size
         self.reduce_results = reduce_results
         self.activation = activation
         self.apply_router_weight_on_input = apply_router_weight_on_input
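
For orientation, here is a minimal standalone sketch (toy sizes, not sglang code) of what the `expert_map_cpu` built above contains on one expert-parallel rank: global expert ids owned by that rank map to contiguous local indices, and every other entry stays -1.

```python
# Toy illustration of the per-rank expert map (hypothetical sizes: 8 experts,
# EP world size 4, rank 1); mirrors the slicing logic in the hunk above.
import torch

num_experts, moe_ep_size, moe_ep_rank = 8, 4, 1
num_local_experts = num_experts // moe_ep_size  # 2 experts per rank

expert_map_cpu = torch.full((num_experts,), -1, dtype=torch.int32)
expert_map_cpu[
    moe_ep_rank * num_local_experts : (moe_ep_rank + 1) * num_local_experts
] = torch.arange(0, num_local_experts, dtype=torch.int32)

print(expert_map_cpu.tolist())  # [-1, -1, 0, 1, -1, -1, -1, -1]
```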
@@ -132,9 +135,6 @@ class FusedMoE(torch.nn.Module):
             not _is_cpu and global_server_args_dict["enable_triton_kernel_moe"]
         )
 
-        if skip_quant:
-            return
-
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = UnquantizedFusedMoEMethod(
                 self.use_triton_kernels
@@ -363,9 +363,9 @@ class FusedMoE(torch.nn.Module):
         expert_data.copy_(loaded_weight)
 
     def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
-        if self.expert_map is None:
+        if self.expert_map_cpu is None:
             return expert_id
-        return self.expert_map[expert_id].item()
+        return self.expert_map_cpu[expert_id].item()
 
     def weight_loader(
         self,
@@ -375,10 +375,48 @@ class FusedMoE(torch.nn.Module):
         shard_id: str,
         expert_id: int,
     ) -> None:
+
+        global_expert_location_metadata = get_global_expert_location_metadata()
+        if global_expert_location_metadata is None:
+            self._weight_loader_impl(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=expert_id,
+            )
+            return
+
+        if expert_id >= self.num_experts - self.num_fused_shared_experts:
+            # This is a shared expert.
+            physical_expert_ids = [expert_id]
+        else:
+            physical_expert_ids = (
+                global_expert_location_metadata.logical_to_all_physical(
+                    self.layer_id, expert_id
+                )
+            )
+
+        for physical_expert_id in physical_expert_ids:
+            self._weight_loader_physical(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=physical_expert_id,
+            )
+
+    def _weight_loader_physical(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+    ) -> None:
         expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
         if expert_id == -1:
             return
-
         self._weight_loader_impl(
             param=param,
             loaded_weight=loaded_weight,
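
The new `weight_loader` split can be read as: resolve a logical expert id to all of its physical replicas (when EPLB metadata is present), then run the physical loader once per replica. Below is a hedged, self-contained sketch of that control flow; the `logical_to_physical` mapping is invented for illustration and is not sglang's `ExpertLocationMetadata` API.

```python
# Toy stand-in for logical -> physical expert replication during weight loading.
# In sglang the mapping comes from
# get_global_expert_location_metadata().logical_to_all_physical(layer_id, expert_id).
from typing import Callable, Dict, List

logical_to_physical: Dict[int, List[int]] = {0: [0, 5], 1: [1], 2: [2], 3: [3]}


def load_logical_expert(
    expert_id: int, loaded_weight: str, write_physical: Callable[[int, str], None]
) -> None:
    # A replicated expert gets the same checkpoint tensor written into every slot.
    for physical_expert_id in logical_to_physical[expert_id]:
        write_physical(physical_expert_id, loaded_weight)


load_logical_expert(0, "experts.0.w1", lambda slot, w: print(f"slot {slot} <- {w}"))
# slot 0 <- experts.0.w1
# slot 5 <- experts.0.w1
```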
@@ -396,8 +434,7 @@ class FusedMoE(torch.nn.Module):
         expert_id: int,
     ) -> None:
 
-        # TP rank is set to 0 if EP is enabled
-        tp_rank = 0 if self.ep_size > 1 else get_tensor_model_parallel_rank()
+        tp_rank = self.moe_tp_rank
 
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
@@ -571,9 +608,14 @@ class FusedMoE(torch.nn.Module):
         )
         return
 
-    def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+    def forward(self, hidden_states: torch.Tensor, topk_output: StandardTopKOutput):
         assert self.quant_method is not None
 
+        if self.expert_map_gpu is not None:
+            topk_output = topk_output._replace(
+                topk_ids=self.expert_map_gpu[topk_output.topk_ids]
+            )
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
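
In `forward`, the GPU copy of the expert map rewrites the router's global expert ids into local ids before the fused kernel runs; experts owned by other EP ranks become -1. A standalone sketch of that lookup (a toy `TopKOut` NamedTuple stands in for sglang's `StandardTopKOutput`, and CPU tensors are used for simplicity):

```python
from typing import NamedTuple

import torch


class TopKOut(NamedTuple):
    # Hypothetical stand-in for the relevant fields of StandardTopKOutput.
    topk_weights: torch.Tensor
    topk_ids: torch.Tensor


# Same map as the earlier sketch: this rank owns global experts 2 and 3.
expert_map = torch.tensor([-1, -1, 0, 1, -1, -1, -1, -1])

out = TopKOut(topk_weights=torch.rand(2, 2), topk_ids=torch.tensor([[2, 7], [3, 0]]))
out = out._replace(topk_ids=expert_map[out.topk_ids])  # same trick as forward()
print(out.topk_ids.tolist())  # [[0, -1], [1, -1]]  (-1 = expert lives on another rank)
```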
@@ -584,17 +626,17 @@ class FusedMoE(torch.nn.Module):
             routed_scaling_factor=self.routed_scaling_factor,
             **(
                 dict(
-                    tp_rank=self.tp_rank,
-                    tp_size=self.tp_size,
-                    ep_rank=self.ep_rank,
-                    ep_size=self.ep_size,
+                    tp_rank=self.moe_tp_rank,
+                    tp_size=self.moe_tp_size,
+                    ep_rank=self.moe_ep_rank,
+                    ep_size=self.moe_ep_size,
                 )
                 if self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod"
                 else {}
             ),
         )
 
-        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+        if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states
@@ -627,3 +669,20 @@ class FusedMoE(torch.nn.Module):
                 ("w3", ckpt_up_proj_name),
             ]
         ]
+
+    @classmethod
+    def make_expert_input_scale_params_mapping(
+        cls,
+        num_experts: int,
+    ) -> List[Tuple[str, str, int, str]]:
+        # (param_name, weight_name, expert_id, shard_id)
+        return [
+            (
+                "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
+                f"experts.{expert_id}.{shard_id}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id in ["w1", "w2", "w3"]
+        ]
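
For reference, the tuples this new classmethod yields for `num_experts=2`, worked out by hand from the list comprehension above (param_name, weight_name, expert_id, shard_id):

```python
# Expected mapping for num_experts=2, derived directly from the comprehension above.
expected = [
    ("experts.w13_", "experts.0.w1.", 0, "w1"),
    ("experts.w2_", "experts.0.w2.", 0, "w2"),
    ("experts.w13_", "experts.0.w3.", 0, "w3"),
    ("experts.w13_", "experts.1.w1.", 1, "w1"),
    ("experts.w2_", "experts.1.w2.", 1, "w2"),
    ("experts.w13_", "experts.1.w3.", 1, "w3"),
]
```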

sglang/srt/layers/moe/token_dispatcher/__init__.py — file without changes

sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from enum import Enum, auto
+from typing import TYPE_CHECKING, NamedTuple, Protocol, runtime_checkable
+
+import torch
+
+
+class DispatchOutputFormat(Enum):
+    standard = auto()
+    deepep_normal = auto()
+    deepep_ll = auto()
+
+    def is_standard(self) -> bool:
+        return self == DispatchOutputFormat.standard
+
+    def is_deepep_normal(self) -> bool:
+        return self == DispatchOutputFormat.deepep_normal
+
+    def is_deepep_ll(self) -> bool:
+        return self == DispatchOutputFormat.deepep_ll
+
+
+@runtime_checkable
+class DispatchOutput(Protocol):
+    """Protocol for dispatch outputs in different formats."""
+
+    @property
+    def format(self) -> DispatchOutputFormat: ...
+
+
+class BaseDispatcherConfig(ABC):
+    """Base class for dispatcher configs."""
+
+    pass
+
+
+class BaseDispatcher(ABC):
+    """Base class for dispatchers."""
+
+    @abstractmethod
+    def dispatch(self, *args, **kwargs) -> DispatchOutput:
+        pass
+
+    @abstractmethod
+    def combine(self, *args, **kwargs) -> torch.Tensor:
+        pass

sglang/srt/layers/moe/token_dispatcher/standard.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import NamedTuple
+
+from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
+    DispatchOutput,
+    DispatchOutputFormat,
+)
+
+
+class StandardDispatchOutput(NamedTuple):
+    """Standard dispatch output."""
+
+    @property
+    def format(self) -> DispatchOutputFormat:
+        return DispatchOutputFormat.standard
+
+
+assert isinstance(StandardDispatchOutput, DispatchOutput)
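
A hedged sketch of how the new dispatcher abstractions compose (this assumes sglang 0.4.10 is importable in your environment; `NoOpDispatcher` is an illustrative name, not part of sglang): a concrete dispatcher implements `dispatch`/`combine` from `BaseDispatcher` and returns an object satisfying the `DispatchOutput` protocol.

```python
import torch

from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
    BaseDispatcher,
    DispatchOutput,
)
from sglang.srt.layers.moe.token_dispatcher.standard import StandardDispatchOutput


class NoOpDispatcher(BaseDispatcher):
    """Toy dispatcher that keeps tokens local and reports the standard format."""

    def dispatch(self, hidden_states: torch.Tensor, **kwargs) -> DispatchOutput:
        return StandardDispatchOutput()

    def combine(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        return hidden_states


out = NoOpDispatcher().dispatch(torch.zeros(4, 8))
assert out.format.is_standard()
```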

sglang/srt/layers/moe/topk.py
@@ -397,7 +397,9 @@ def grouped_topk_gpu(
         .reshape(num_token, -1)
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-    topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    topk_weights, topk_ids = torch.topk(
+        tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
+    )
     if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
@@ -486,7 +488,9 @@ def biased_grouped_topk_impl(
     tmp_scores = scores_for_choice.masked_fill(
         ~score_mask.bool(), float("-inf")
     )  # [n, e]
-    _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    _, topk_ids = torch.topk(
+        tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
+    )
     topk_weights = scores.gather(1, topk_ids)
 
     if num_fused_shared_experts:
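
Both hunks make the top-k result sorted whenever fused shared experts are enabled, presumably because the code that follows overwrites `topk_ids[:, -1]` with a shared-expert slot: with sorted output the replaced column is guaranteed to be the lowest-scoring routed expert. A small illustration of the difference (plain PyTorch, not sglang code):

```python
import torch

scores = torch.tensor([[0.1, 0.9, 0.3, 0.7]])

# sorted=True: columns are ordered best-to-worst, so column -1 is the weakest pick.
_, ids_sorted = torch.topk(scores, k=2, dim=-1, sorted=True)
print(ids_sorted.tolist())  # [[1, 3]] -> overwriting ids[:, -1] drops expert 3

# sorted=False: column order is unspecified, so overwriting ids[:, -1] could
# silently drop the highest-scoring expert instead of the weakest one.
```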

sglang/srt/layers/quantization/fp8.py
@@ -172,7 +172,6 @@ class Fp8Config(QuantizationConfig):
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional[QuantizeMethodBase]:
         from sglang.srt.layers.linear import LinearBase
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, LinearBase):
@@ -181,8 +180,6 @@ class Fp8Config(QuantizationConfig):
             return Fp8LinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return Fp8MoEMethod(self)
-        elif isinstance(layer, EPMoE):
-            return Fp8EPMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -984,23 +981,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
 
-        if isinstance(layer, EPMoE):
-            layer.w13_weight_scale = (
-                layer.w13_weight_scale_inv
-                if self.block_quant
-                else layer.w13_weight_scale
-            )
-            layer.w2_weight_scale = (
-                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
-            )
-            return layer.run_moe(
-                hidden_states=x,
-                topk_output=topk_output,
-            )
-
         if use_intel_amx_backend(layer):
             from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
 

sglang/srt/layers/quantization/modelopt_quant.py
@@ -900,6 +900,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         layer.w13_blockscale_swizzled = Parameter(
             w13_blockscale_swizzled, requires_grad=False
         )
+        del layer.w13_weight_scale
 
         # This is for quantization, so we need to invert it.
         layer.w13_input_scale_quant = Parameter(
@@ -935,6 +936,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         layer.w2_blockscale_swizzled = Parameter(
             w2_blockscale_swizzled, requires_grad=False
         )
+        del layer.w2_weight_scale
         layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
         device = layer.w13_weight.device

sglang/srt/layers/quantization/unquant.py
@@ -204,14 +204,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
 
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
-
-        if isinstance(layer, EPMoE):
-            return layer.run_moe(
-                hidden_states=x,
-                topk_output=topk_output,
-            )
-
         return self.forward(
             x=x,
             layer=layer,

sglang/srt/layers/quantization/w4afp8.py
@@ -276,6 +276,7 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):
         layer: EPMoE,
         hidden_states: torch.Tensor,
         topk_output: TopKOutput,
+        **kwargs,
     ) -> torch.Tensor:
 
         # TODO(ch-wan): move it out of this class