sglang 0.5.0rc2__py3-none-any.whl → 0.5.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -0
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +375 -51
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/top_level.txt +0 -0
@@ -2,20 +2,26 @@ from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
|
|
2
2
|
BaseDispatcher,
|
3
3
|
BaseDispatcherConfig,
|
4
4
|
DispatchOutput,
|
5
|
+
DispatchOutputChecker,
|
5
6
|
DispatchOutputFormat,
|
6
7
|
)
|
7
8
|
from sglang.srt.layers.moe.token_dispatcher.deepep import (
|
9
|
+
AscendDeepEPLLOutput,
|
8
10
|
DeepEPConfig,
|
9
11
|
DeepEPDispatcher,
|
10
12
|
DeepEPLLOutput,
|
11
13
|
DeepEPNormalOutput,
|
12
14
|
)
|
15
|
+
from sglang.srt.layers.moe.token_dispatcher.standard import StandardDispatchOutput
|
13
16
|
|
14
17
|
__all__ = [
|
18
|
+
"AscendDeepEPLLOutput",
|
15
19
|
"BaseDispatcher",
|
16
20
|
"BaseDispatcherConfig",
|
17
21
|
"DispatchOutput",
|
18
22
|
"DispatchOutputFormat",
|
23
|
+
"DispatchOutputChecker",
|
24
|
+
"StandardDispatchOutput",
|
19
25
|
"DeepEPConfig",
|
20
26
|
"DeepEPDispatcher",
|
21
27
|
"DeepEPNormalOutput",
|
@@ -2,35 +2,76 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from abc import ABC, abstractmethod
|
4
4
|
from enum import Enum, auto
|
5
|
-
from typing import Protocol, runtime_checkable
|
5
|
+
from typing import TYPE_CHECKING, Protocol, TypeGuard, Union, runtime_checkable
|
6
6
|
|
7
7
|
import torch
|
8
8
|
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from sglang.srt.layers.moe.token_dispatcher import (
|
11
|
+
AscendDeepEPLLOutput,
|
12
|
+
DeepEPLLOutput,
|
13
|
+
DeepEPNormalOutput,
|
14
|
+
StandardDispatchOutput,
|
15
|
+
)
|
9
16
|
|
10
|
-
class MoEA2ABackend(Enum):
|
11
|
-
none = "none"
|
12
|
-
deepep = "deepep"
|
13
17
|
|
14
|
-
|
15
|
-
return self == MoEA2ABackend.none
|
18
|
+
class DispatchOutputChecker:
|
16
19
|
|
17
|
-
|
18
|
-
|
20
|
+
@staticmethod
|
21
|
+
def format_is_standard(
|
22
|
+
dispatch_output: DispatchOutput,
|
23
|
+
) -> TypeGuard[StandardDispatchOutput]:
|
24
|
+
return dispatch_output.format.is_standard()
|
25
|
+
|
26
|
+
@staticmethod
|
27
|
+
def format_is_deepep_normal(
|
28
|
+
dispatch_output: DispatchOutput,
|
29
|
+
) -> TypeGuard[DeepEPNormalOutput]:
|
30
|
+
return dispatch_output.format.is_deepep_normal()
|
31
|
+
|
32
|
+
@staticmethod
|
33
|
+
def format_is_deepep_ll(
|
34
|
+
dispatch_output: DispatchOutput,
|
35
|
+
) -> TypeGuard[DeepEPLLOutput]:
|
36
|
+
return dispatch_output.format.is_deepep_ll()
|
37
|
+
|
38
|
+
@staticmethod
|
39
|
+
def format_is_deepep(
|
40
|
+
dispatch_output: DispatchOutput,
|
41
|
+
) -> TypeGuard[Union[DeepEPNormalOutput, DeepEPLLOutput]]:
|
42
|
+
return dispatch_output.format.is_deepep()
|
43
|
+
|
44
|
+
@staticmethod
|
45
|
+
def format_is_ascent_ll(
|
46
|
+
dispatch_output: DispatchOutput,
|
47
|
+
) -> TypeGuard[AscendDeepEPLLOutput]:
|
48
|
+
return dispatch_output.format.is_ascent_ll()
|
19
49
|
|
20
50
|
|
21
51
|
class DispatchOutputFormat(Enum):
|
22
|
-
|
23
|
-
|
24
|
-
|
52
|
+
|
53
|
+
STANDARD = auto()
|
54
|
+
DEEPEP_NORMAL = auto()
|
55
|
+
DEEPEP_LL = auto()
|
56
|
+
ASCENT_LL = auto()
|
25
57
|
|
26
58
|
def is_standard(self) -> bool:
|
27
|
-
return self == DispatchOutputFormat.
|
59
|
+
return self == DispatchOutputFormat.STANDARD
|
28
60
|
|
29
61
|
def is_deepep_normal(self) -> bool:
|
30
|
-
return self == DispatchOutputFormat.
|
62
|
+
return self == DispatchOutputFormat.DEEPEP_NORMAL
|
31
63
|
|
32
64
|
def is_deepep_ll(self) -> bool:
|
33
|
-
return self == DispatchOutputFormat.
|
65
|
+
return self == DispatchOutputFormat.DEEPEP_LL
|
66
|
+
|
67
|
+
def is_deepep(self) -> bool:
|
68
|
+
return self in [
|
69
|
+
DispatchOutputFormat.DEEPEP_NORMAL,
|
70
|
+
DispatchOutputFormat.DEEPEP_LL,
|
71
|
+
]
|
72
|
+
|
73
|
+
def is_ascent_ll(self) -> bool:
|
74
|
+
return self == DispatchOutputFormat.ASCENT_LL
|
34
75
|
|
35
76
|
|
36
77
|
@runtime_checkable
|
@@ -2,27 +2,17 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import logging
|
4
4
|
from dataclasses import dataclass
|
5
|
-
from typing import
|
6
|
-
TYPE_CHECKING,
|
7
|
-
List,
|
8
|
-
NamedTuple,
|
9
|
-
Optional,
|
10
|
-
Protocol,
|
11
|
-
Tuple,
|
12
|
-
Union,
|
13
|
-
runtime_checkable,
|
14
|
-
)
|
5
|
+
from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union
|
15
6
|
|
16
7
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
8
|
+
from sglang.srt.layers.moe import DeepEPMode, get_deepep_config, is_tbo_enabled
|
17
9
|
from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
|
18
10
|
BaseDispatcher,
|
19
11
|
BaseDispatcherConfig,
|
20
12
|
DispatchOutput,
|
21
13
|
DispatchOutputFormat,
|
22
14
|
)
|
23
|
-
from sglang.srt.layers.moe.utils import DeepEPMode
|
24
15
|
from sglang.srt.layers.quantization import deep_gemm_wrapper
|
25
|
-
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
26
16
|
from sglang.srt.utils import (
|
27
17
|
get_bool_env_var,
|
28
18
|
get_int_env_var,
|
@@ -72,7 +62,7 @@ class DeepEPNormalOutput(NamedTuple):
|
|
72
62
|
|
73
63
|
@property
|
74
64
|
def format(self) -> DispatchOutputFormat:
|
75
|
-
return DispatchOutputFormat.
|
65
|
+
return DispatchOutputFormat.DEEPEP_NORMAL
|
76
66
|
|
77
67
|
|
78
68
|
class DeepEPLLOutput(NamedTuple):
|
@@ -86,7 +76,7 @@ class DeepEPLLOutput(NamedTuple):
|
|
86
76
|
|
87
77
|
@property
|
88
78
|
def format(self) -> DispatchOutputFormat:
|
89
|
-
return DispatchOutputFormat.
|
79
|
+
return DispatchOutputFormat.DEEPEP_LL
|
90
80
|
|
91
81
|
|
92
82
|
class AscendDeepEPLLOutput(NamedTuple):
|
@@ -101,7 +91,7 @@ class AscendDeepEPLLOutput(NamedTuple):
|
|
101
91
|
|
102
92
|
@property
|
103
93
|
def format(self) -> DispatchOutputFormat:
|
104
|
-
return DispatchOutputFormat.
|
94
|
+
return DispatchOutputFormat.ASCENT_LL
|
105
95
|
|
106
96
|
|
107
97
|
assert isinstance(DeepEPNormalOutput, DispatchOutput)
|
@@ -128,8 +118,8 @@ class DeepEPBuffer:
|
|
128
118
|
hidden_size: int,
|
129
119
|
param_bytes: int,
|
130
120
|
deepep_mode: DeepEPMode,
|
131
|
-
num_max_dispatch_tokens_per_rank: int =
|
132
|
-
num_experts: int =
|
121
|
+
num_max_dispatch_tokens_per_rank: int = -1,
|
122
|
+
num_experts: int = -1,
|
133
123
|
):
|
134
124
|
if cls._buffer is not None:
|
135
125
|
return cls._buffer
|
@@ -156,8 +146,8 @@ class DeepEPBuffer:
|
|
156
146
|
num_rdma_bytes,
|
157
147
|
)
|
158
148
|
if deepep_mode.enable_low_latency():
|
159
|
-
assert num_max_dispatch_tokens_per_rank
|
160
|
-
assert num_experts
|
149
|
+
assert num_max_dispatch_tokens_per_rank != -1
|
150
|
+
assert num_experts != -1 and num_experts % group.size() == 0
|
161
151
|
num_rdma_bytes = max(
|
162
152
|
Buffer.get_low_latency_rdma_size_hint(
|
163
153
|
num_max_dispatch_tokens_per_rank,
|
@@ -181,7 +171,7 @@ class DeepEPBuffer:
|
|
181
171
|
).multi_processor_count
|
182
172
|
if (
|
183
173
|
(deepep_mode != DeepEPMode.LOW_LATENCY)
|
184
|
-
and not
|
174
|
+
and not is_tbo_enabled()
|
185
175
|
and (DeepEPConfig.get_instance().num_sms < total_num_sms // 2)
|
186
176
|
):
|
187
177
|
logger.warning(
|
@@ -226,7 +216,7 @@ class DeepEPConfig(BaseDispatcherConfig):
|
|
226
216
|
_instance = None
|
227
217
|
|
228
218
|
def __init__(self):
|
229
|
-
config_str =
|
219
|
+
config_str = get_deepep_config()
|
230
220
|
if config_str:
|
231
221
|
config_parsed = load_json_config(config_str)
|
232
222
|
if torch.distributed.get_rank() == 0:
|