sglang 0.5.0rc2__py3-none-any.whl → 0.5.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_one_batch.py +0 -6
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +24 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -1
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +27 -2
  24. sglang/srt/entrypoints/http_server.py +12 -0
  25. sglang/srt/entrypoints/openai/protocol.py +2 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +22 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +9 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +11 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
  37. sglang/srt/layers/attention/triton_backend.py +85 -46
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +51 -3
  46. sglang/srt/layers/dp_attention.py +23 -4
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +5 -1
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  60. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  61. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  62. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  63. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  64. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  65. sglang/srt/layers/moe/router.py +15 -9
  66. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  67. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  68. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  69. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  70. sglang/srt/layers/moe/topk.py +167 -83
  71. sglang/srt/layers/moe/utils.py +159 -18
  72. sglang/srt/layers/quantization/__init__.py +13 -14
  73. sglang/srt/layers/quantization/awq.py +7 -7
  74. sglang/srt/layers/quantization/base_config.py +2 -6
  75. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  76. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
  77. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -0
  78. sglang/srt/layers/quantization/fp8.py +127 -119
  79. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  80. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  81. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  82. sglang/srt/layers/quantization/gptq.py +5 -4
  83. sglang/srt/layers/quantization/marlin_utils.py +11 -3
  84. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  85. sglang/srt/layers/quantization/modelopt_quant.py +165 -68
  86. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  87. sglang/srt/layers/quantization/mxfp4.py +206 -37
  88. sglang/srt/layers/quantization/quark/quark.py +390 -0
  89. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  90. sglang/srt/layers/quantization/unquant.py +34 -70
  91. sglang/srt/layers/quantization/utils.py +25 -0
  92. sglang/srt/layers/quantization/w4afp8.py +7 -8
  93. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  94. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  95. sglang/srt/layers/radix_attention.py +6 -0
  96. sglang/srt/layers/rotary_embedding.py +1 -0
  97. sglang/srt/lora/lora_manager.py +21 -22
  98. sglang/srt/lora/lora_registry.py +3 -3
  99. sglang/srt/lora/mem_pool.py +26 -24
  100. sglang/srt/lora/utils.py +10 -12
  101. sglang/srt/managers/cache_controller.py +76 -18
  102. sglang/srt/managers/detokenizer_manager.py +10 -2
  103. sglang/srt/managers/io_struct.py +9 -0
  104. sglang/srt/managers/mm_utils.py +1 -1
  105. sglang/srt/managers/schedule_batch.py +4 -9
  106. sglang/srt/managers/scheduler.py +25 -16
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/template_manager.py +7 -5
  109. sglang/srt/managers/tokenizer_manager.py +60 -21
  110. sglang/srt/managers/tp_worker.py +1 -0
  111. sglang/srt/managers/utils.py +59 -1
  112. sglang/srt/mem_cache/allocator.py +7 -5
  113. sglang/srt/mem_cache/allocator_ascend.py +0 -11
  114. sglang/srt/mem_cache/hicache_storage.py +14 -4
  115. sglang/srt/mem_cache/memory_pool.py +3 -3
  116. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  117. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  118. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  119. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  120. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  121. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  122. sglang/srt/model_executor/cuda_graph_runner.py +25 -12
  123. sglang/srt/model_executor/forward_batch_info.py +4 -1
  124. sglang/srt/model_executor/model_runner.py +43 -32
  125. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  126. sglang/srt/model_loader/loader.py +24 -6
  127. sglang/srt/models/dbrx.py +12 -6
  128. sglang/srt/models/deepseek.py +2 -1
  129. sglang/srt/models/deepseek_nextn.py +3 -1
  130. sglang/srt/models/deepseek_v2.py +224 -223
  131. sglang/srt/models/ernie4.py +2 -2
  132. sglang/srt/models/glm4_moe.py +25 -63
  133. sglang/srt/models/glm4v.py +52 -1
  134. sglang/srt/models/glm4v_moe.py +8 -11
  135. sglang/srt/models/gpt_oss.py +34 -74
  136. sglang/srt/models/granitemoe.py +0 -1
  137. sglang/srt/models/grok.py +375 -51
  138. sglang/srt/models/interns1.py +12 -47
  139. sglang/srt/models/internvl.py +6 -51
  140. sglang/srt/models/llama4.py +0 -2
  141. sglang/srt/models/minicpm3.py +0 -1
  142. sglang/srt/models/mixtral.py +0 -2
  143. sglang/srt/models/nemotron_nas.py +435 -0
  144. sglang/srt/models/olmoe.py +0 -1
  145. sglang/srt/models/phi4mm.py +3 -21
  146. sglang/srt/models/qwen2_5_vl.py +2 -0
  147. sglang/srt/models/qwen2_moe.py +3 -18
  148. sglang/srt/models/qwen3.py +2 -2
  149. sglang/srt/models/qwen3_classification.py +7 -1
  150. sglang/srt/models/qwen3_moe.py +9 -38
  151. sglang/srt/models/step3_vl.py +2 -1
  152. sglang/srt/models/xverse_moe.py +11 -5
  153. sglang/srt/multimodal/processors/base_processor.py +3 -3
  154. sglang/srt/multimodal/processors/internvl.py +7 -2
  155. sglang/srt/multimodal/processors/llava.py +11 -7
  156. sglang/srt/offloader.py +433 -0
  157. sglang/srt/operations.py +6 -1
  158. sglang/srt/reasoning_parser.py +4 -3
  159. sglang/srt/server_args.py +237 -104
  160. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  161. sglang/srt/speculative/eagle_utils.py +36 -13
  162. sglang/srt/speculative/eagle_worker.py +56 -3
  163. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  164. sglang/srt/two_batch_overlap.py +16 -11
  165. sglang/srt/utils.py +68 -70
  166. sglang/test/runners.py +8 -5
  167. sglang/test/test_block_fp8.py +5 -6
  168. sglang/test/test_block_fp8_ep.py +13 -19
  169. sglang/test/test_cutlass_moe.py +4 -6
  170. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  171. sglang/test/test_fp4_moe.py +4 -3
  172. sglang/test/test_utils.py +7 -0
  173. sglang/utils.py +0 -1
  174. sglang/version.py +1 -1
  175. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/METADATA +7 -7
  176. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/RECORD +179 -161
  177. sglang/srt/layers/quantization/fp4.py +0 -557
  178. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/WHEEL +0 -0
  179. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py
@@ -2,20 +2,26 @@ from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
     BaseDispatcher,
     BaseDispatcherConfig,
     DispatchOutput,
+    DispatchOutputChecker,
     DispatchOutputFormat,
 )
 from sglang.srt.layers.moe.token_dispatcher.deepep import (
+    AscendDeepEPLLOutput,
     DeepEPConfig,
     DeepEPDispatcher,
     DeepEPLLOutput,
     DeepEPNormalOutput,
 )
+from sglang.srt.layers.moe.token_dispatcher.standard import StandardDispatchOutput
 
 __all__ = [
+    "AscendDeepEPLLOutput",
     "BaseDispatcher",
     "BaseDispatcherConfig",
     "DispatchOutput",
     "DispatchOutputFormat",
+    "DispatchOutputChecker",
+    "StandardDispatchOutput",
     "DeepEPConfig",
     "DeepEPDispatcher",
     "DeepEPNormalOutput",
sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py
@@ -2,35 +2,76 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from enum import Enum, auto
-from typing import Protocol, runtime_checkable
+from typing import TYPE_CHECKING, Protocol, TypeGuard, Union, runtime_checkable
 
 import torch
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        AscendDeepEPLLOutput,
+        DeepEPLLOutput,
+        DeepEPNormalOutput,
+        StandardDispatchOutput,
+    )
 
-class MoEA2ABackend(Enum):
-    none = "none"
-    deepep = "deepep"
 
-    def is_none(self):
-        return self == MoEA2ABackend.none
+class DispatchOutputChecker:
 
-    def is_deepep(self):
-        return self == MoEA2ABackend.deepep
+    @staticmethod
+    def format_is_standard(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[StandardDispatchOutput]:
+        return dispatch_output.format.is_standard()
+
+    @staticmethod
+    def format_is_deepep_normal(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[DeepEPNormalOutput]:
+        return dispatch_output.format.is_deepep_normal()
+
+    @staticmethod
+    def format_is_deepep_ll(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[DeepEPLLOutput]:
+        return dispatch_output.format.is_deepep_ll()
+
+    @staticmethod
+    def format_is_deepep(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[Union[DeepEPNormalOutput, DeepEPLLOutput]]:
+        return dispatch_output.format.is_deepep()
+
+    @staticmethod
+    def format_is_ascent_ll(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[AscendDeepEPLLOutput]:
+        return dispatch_output.format.is_ascent_ll()
 
 
 class DispatchOutputFormat(Enum):
-    standard = auto()
-    deepep_normal = auto()
-    deepep_ll = auto()
+
+    STANDARD = auto()
+    DEEPEP_NORMAL = auto()
+    DEEPEP_LL = auto()
+    ASCENT_LL = auto()
 
     def is_standard(self) -> bool:
-        return self == DispatchOutputFormat.standard
+        return self == DispatchOutputFormat.STANDARD
 
     def is_deepep_normal(self) -> bool:
-        return self == DispatchOutputFormat.deepep_normal
+        return self == DispatchOutputFormat.DEEPEP_NORMAL
 
     def is_deepep_ll(self) -> bool:
-        return self == DispatchOutputFormat.deepep_ll
+        return self == DispatchOutputFormat.DEEPEP_LL
+
+    def is_deepep(self) -> bool:
+        return self in [
+            DispatchOutputFormat.DEEPEP_NORMAL,
+            DispatchOutputFormat.DEEPEP_LL,
+        ]
+
+    def is_ascent_ll(self) -> bool:
+        return self == DispatchOutputFormat.ASCENT_LL
 
 
 @runtime_checkable
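The new DispatchOutputChecker wraps the enum predicates in typing.TypeGuard functions, so a static type checker can narrow the broad DispatchOutput protocol to the concrete NamedTuple type inside a branch, letting callers access format-specific fields without casts. Below is a minimal, self-contained sketch of that pattern (Python 3.10+ for typing.TypeGuard); the class and field names are illustrative stand-ins, not the actual sglang definitions.

from enum import Enum, auto
from typing import NamedTuple, Protocol, TypeGuard


class Format(Enum):
    STANDARD = auto()
    DEEPEP_LL = auto()


class Output(Protocol):
    # Anything exposing a `format` attribute satisfies this protocol.
    @property
    def format(self) -> Format: ...


class DeepEPLLOut(NamedTuple):
    hidden_states: list[int]  # stand-in payload field

    @property
    def format(self) -> Format:
        return Format.DEEPEP_LL


def format_is_deepep_ll(out: Output) -> TypeGuard[DeepEPLLOut]:
    # When this returns True, type checkers treat `out` as DeepEPLLOut.
    return out.format == Format.DEEPEP_LL


def run(out: Output) -> None:
    if format_is_deepep_ll(out):
        # `out` is narrowed here, so DeepEPLLOut fields are visible to mypy/pyright.
        print(len(out.hidden_states))


run(DeepEPLLOut(hidden_states=[1, 2, 3]))  # prints 3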
sglang/srt/layers/moe/token_dispatcher/deepep.py
@@ -2,27 +2,17 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    List,
-    NamedTuple,
-    Optional,
-    Protocol,
-    Tuple,
-    Union,
-    runtime_checkable,
-)
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union
 
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.moe import DeepEPMode, get_deepep_config, is_tbo_enabled
 from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
     BaseDispatcher,
     BaseDispatcherConfig,
     DispatchOutput,
     DispatchOutputFormat,
 )
-from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import (
     get_bool_env_var,
     get_int_env_var,
@@ -72,7 +62,7 @@ class DeepEPNormalOutput(NamedTuple):
 
     @property
     def format(self) -> DispatchOutputFormat:
-        return DispatchOutputFormat.deepep_normal
+        return DispatchOutputFormat.DEEPEP_NORMAL
 
 
 class DeepEPLLOutput(NamedTuple):
@@ -86,7 +76,7 @@ class DeepEPLLOutput(NamedTuple):
 
     @property
     def format(self) -> DispatchOutputFormat:
-        return DispatchOutputFormat.deepep_ll
+        return DispatchOutputFormat.DEEPEP_LL
 
 
 class AscendDeepEPLLOutput(NamedTuple):
@@ -101,7 +91,7 @@ class AscendDeepEPLLOutput(NamedTuple):
 
     @property
     def format(self) -> DispatchOutputFormat:
-        return DispatchOutputFormat.deepep_ll
+        return DispatchOutputFormat.ASCENT_LL
 
 
 assert isinstance(DeepEPNormalOutput, DispatchOutput)
@@ -128,8 +118,8 @@ class DeepEPBuffer:
         hidden_size: int,
         param_bytes: int,
         deepep_mode: DeepEPMode,
-        num_max_dispatch_tokens_per_rank: int = None,
-        num_experts: int = None,
+        num_max_dispatch_tokens_per_rank: int = -1,
+        num_experts: int = -1,
     ):
         if cls._buffer is not None:
             return cls._buffer
@@ -156,8 +146,8 @@ class DeepEPBuffer:
             num_rdma_bytes,
         )
         if deepep_mode.enable_low_latency():
-            assert num_max_dispatch_tokens_per_rank is not None
-            assert num_experts is not None and num_experts % group.size() == 0
+            assert num_max_dispatch_tokens_per_rank != -1
+            assert num_experts != -1 and num_experts % group.size() == 0
             num_rdma_bytes = max(
                 Buffer.get_low_latency_rdma_size_hint(
                     num_max_dispatch_tokens_per_rank,
@@ -181,7 +171,7 @@ class DeepEPBuffer:
         ).multi_processor_count
         if (
             (deepep_mode != DeepEPMode.LOW_LATENCY)
-            and not global_server_args_dict["enable_two_batch_overlap"]
+            and not is_tbo_enabled()
            and (DeepEPConfig.get_instance().num_sms < total_num_sms // 2)
         ):
            logger.warning(
@@ -226,7 +216,7 @@ class DeepEPConfig(BaseDispatcherConfig):
     _instance = None
 
     def __init__(self):
-        config_str = global_server_args_dict["deepep_config"]
+        config_str = get_deepep_config()
        if config_str:
            config_parsed = load_json_config(config_str)
            if torch.distributed.get_rank() == 0:
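Several of the deepep.py hunks replace direct reads of global_server_args_dict with accessors imported from sglang.srt.layers.moe (get_deepep_config(), is_tbo_enabled()), decoupling the dispatcher from the scheduler's global dict. DeepEPBuffer's optional num_max_dispatch_tokens_per_rank and num_experts parameters also switch from a None default to -1, keeping the int annotations honest while the guarding asserts are adjusted to match. A rough sketch of what such module-level accessors can look like; the initializer name and storage below are assumptions for illustration, not the actual sglang implementation:

from typing import Optional

# Module-level state, set once at startup from the parsed server args.
_DEEPEP_CONFIG: Optional[str] = None
_TBO_ENABLED: bool = False


def initialize_moe_config(deepep_config: Optional[str], enable_two_batch_overlap: bool) -> None:
    # Hypothetical initializer: copy the relevant server args into this module.
    global _DEEPEP_CONFIG, _TBO_ENABLED
    _DEEPEP_CONFIG = deepep_config
    _TBO_ENABLED = enable_two_batch_overlap


def get_deepep_config() -> Optional[str]:
    return _DEEPEP_CONFIG


def is_tbo_enabled() -> bool:
    return _TBO_ENABLED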
sglang/srt/layers/moe/token_dispatcher/standard.py
@@ -13,7 +13,7 @@ class StandardDispatchOutput(NamedTuple):
 
     @property
     def format(self) -> DispatchOutputFormat:
-        return DispatchOutputFormat.standard
+        return DispatchOutputFormat.STANDARD
 
 
 assert isinstance(StandardDispatchOutput, DispatchOutput)
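The trailing assert passes the class object itself to isinstance(). That works because DispatchOutput is declared as a @runtime_checkable Protocol (the decorator is visible as context at the end of the base_dispatcher.py hunk): the runtime check only verifies that the required attributes exist, and the format property is present as an attribute of the class object. A small self-contained illustration of that behavior; the names are stand-ins, not the sglang definitions:

from typing import NamedTuple, Protocol, runtime_checkable


@runtime_checkable
class HasFormat(Protocol):
    @property
    def format(self) -> str: ...


class Output(NamedTuple):
    value: int

    @property
    def format(self) -> str:
        return "standard"


# runtime_checkable isinstance checks only look for the attribute, so the
# class object itself (not just its instances) satisfies the protocol.
assert isinstance(Output, HasFormat)
assert isinstance(Output(value=1), HasFormat)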