sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +87 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +26 -7
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +374 -136
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +13 -13
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +25 -27
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +237 -204
- sglang/srt/managers/detokenizer_manager.py +48 -2
- sglang/srt/managers/io_struct.py +57 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +94 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +122 -42
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +51 -23
- sglang/srt/mem_cache/hiradix_cache.py +87 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +77 -14
- sglang/srt/mem_cache/memory_pool_host.py +4 -5
- sglang/srt/mem_cache/radix_cache.py +6 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -5
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +38 -13
- sglang/srt/models/gpt_oss.py +2 -15
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +66 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +122 -56
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +73 -5
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/mxfp4_tensor.py
CHANGED
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 import torch
 
 
@@ -24,7 +26,7 @@ class MXFP4QuantizeUtil
     E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5])
 
     @classmethod
-    def quantize(cls, input: torch.Tensor, block_size: int) -> tuple:
+    def quantize(cls, input: torch.Tensor, block_size: Optional[int]) -> tuple:
         """Converting a tensor to a quantized format based on MXFP4 quantization. Only E4M3 is supported.
         Args:
             input (torch.Tensor): The input tensor to be quantized.
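Note on the hunk above: `E2M1_bounds` holds the midpoints between the eight non-negative FP4 (E2M1) magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}. As a rough illustration of how such bounds drive rounding, here is a minimal torch sketch; the helper name and the rounding behavior at exact boundaries are assumptions, not the wheel's actual kernel:

import torch

# E2M1-representable magnitudes and the decision boundaries between them
# (the boundaries match the E2M1_bounds constant quoted in the diff).
E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
E2M1_BOUNDS = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0])

def fake_quantize_e2m1(x: torch.Tensor) -> torch.Tensor:
    # bucketize maps each |x| to the index of its nearest E2M1 level.
    sign = torch.sign(x)
    codes = torch.bucketize(x.abs(), E2M1_BOUNDS)
    return sign * E2M1_VALUES[codes]

print(fake_quantize_e2m1(torch.tensor([0.2, 0.6, 2.4, -5.6])))
# tensor([ 0.0000,  0.5000,  2.0000, -6.0000])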
sglang/srt/layers/quantization/utils.py
CHANGED
@@ -77,6 +77,19 @@ def is_layer_skipped(
         )
     else:
         is_skipped = prefix in ignored_layers
+        if "gate_up_proj" in prefix:
+            prefix_gate = prefix.replace("gate_up_proj", "gate_proj")
+            prefix_up = prefix.replace("gate_up_proj", "up_proj")
+            if prefix_gate in ignored_layers and prefix_up in ignored_layers:
+                is_skipped = True
+        elif "experts" in prefix:
+            is_skipped = any(
+                [
+                    prefix in layer_name
+                    for layer_name in ignored_layers
+                    if "experts" in layer_name
+                ]
+            )
 
     assert is_skipped is not None
     return is_skipped
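In plain terms, the new branches let a fused `gate_up_proj` module be treated as skipped when a config lists only the unfused `gate_proj`/`up_proj` names, and let an expert-layer prefix match any ignored entry that mentions "experts" by substring. A standalone restatement of the matching rules (the layer names below are invented for illustration):

def matches_ignored(prefix: str, ignored_layers: list) -> bool:
    # Exact match, as before.
    if prefix in ignored_layers:
        return True
    if "gate_up_proj" in prefix:
        # Fused projection: skipped only if BOTH unfused halves are ignored.
        return (
            prefix.replace("gate_up_proj", "gate_proj") in ignored_layers
            and prefix.replace("gate_up_proj", "up_proj") in ignored_layers
        )
    if "experts" in prefix:
        # Expert prefix: skipped if it occurs inside any ignored expert name.
        return any(prefix in name for name in ignored_layers if "experts" in name)
    return False

ignored = ["model.layers.0.mlp.gate_proj", "model.layers.0.mlp.up_proj"]
assert matches_ignored("model.layers.0.mlp.gate_up_proj", ignored)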
sglang/srt/layers/quantization/w8a8_int8.py
CHANGED
@@ -551,7 +551,7 @@ class NPU_W8A8LinearMethodImpl:
     def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
         params_dict = {}
         params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
-        params_dict["input_offset"] = torch.empty(1, dtype=torch.int8)
+        params_dict["input_offset"] = torch.empty(1, dtype=params_dtype)
         return params_dict
 
     @staticmethod
@@ -582,11 +582,11 @@ class NPU_W8A8LinearMethodImpl:
         if original_dtype != torch.int8:
             x = torch_npu.npu_quantize(
                 x,
-                layer.aclnn_input_scale,
+                layer.aclnn_input_scale_reciprocal,
                 layer.aclnn_input_offset,
                 torch.qint8,
                 -1,
-                True,
+                False,
             )
         # Only fuse bias add into GEMM for rank 0 (this ensures that
         # bias will not get added more than once in Attention TP>1 case)
@@ -608,6 +608,10 @@ class NPU_W8A8LinearMethodImpl:
             layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
             requires_grad=False,
         )
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
         layer.aclnn_input_offset = torch.nn.Parameter(
             layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
             requires_grad=False,
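The idea behind these two w8a8 hunks: the final argument of `torch_npu.npu_quantize` selects whether the scale divides or multiplies the input, so precomputing the reciprocal scale once at weight-load time replaces a per-call division with a multiplication. A torch-only sketch of the equivalence (names assumed, no NPU required):

import torch

x = torch.randn(4, 8)
scale = torch.tensor(0.05)
scale_reciprocal = 1.0 / scale  # computed once, as in the diff above

# Dividing by the scale and multiplying by its precomputed reciprocal
# produce the same pre-rounding values up to floating-point error.
assert torch.allclose(x / scale, x * scale_reciprocal)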
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -1876,7 +1876,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-def apply_rotary_pos_emb(
+def apply_rotary_pos_emb_native(
     q: torch.Tensor,
     k: torch.Tensor,
     cos: torch.Tensor,
@@ -1899,6 +1899,33 @@ def apply_rotary_pos_emb(
     return q_embed, k_embed
 
 
+def apply_rotary_pos_emb_npu(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if q.shape[1] != 128:
+        return apply_rotary_pos_emb_native(q, k, cos, sin, unsqueeze_dim)
+    cos = cos.unsqueeze(unsqueeze_dim)
+    cos = torch.transpose(cos, 1, 2)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    sin = torch.transpose(sin, 1, 2)
+    q = torch.transpose(q, 1, 2)
+    k = torch.transpose(k, 1, 2)
+    q_embed, k_embed = torch_npu.npu_apply_rotary_pos_emb(q, k, cos, sin)
+    q_embed = torch.transpose(q_embed, 1, 2)
+    k_embed = torch.transpose(k_embed, 1, 2)
+    return q_embed, k_embed
+
+
+if _is_npu:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_npu
+else:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_native
+
+
 def get_rope_cpu(
     head_size: int,
     rotary_dim: int,
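For reference, the native path that the NPU branch falls back to implements the standard rotate-half RoPE formula q' = q*cos + rotate_half(q)*sin. A self-contained sketch of that formulation (textbook RoPE written from scratch, not copied from the wheel):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dim in half and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin, unsqueeze_dim=1):
    # Broadcast cos/sin over the head dimension, then rotate q and k.
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = q * cos + rotate_half(q) * sin
    k_embed = k * cos + rotate_half(k) * sin
    return q_embed, k_embed

# Shapes: q/k are (batch, heads, seq, head_dim); cos/sin are (batch, seq, head_dim).
q = torch.randn(1, 8, 16, 64)
k = torch.randn(1, 8, 16, 64)
ang = torch.outer(torch.arange(16.0), 1.0 / 10000 ** (torch.arange(32.0) / 32))
cos = torch.cat([ang.cos(), ang.cos()], dim=-1)[None]
sin = torch.cat([ang.sin(), ang.sin()], dim=-1)[None]
q_rot, k_rot = apply_rope(q, k, cos, sin)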
sglang/srt/layers/sampler.py
CHANGED
@@ -27,6 +27,7 @@ if is_cuda():
 logger = logging.getLogger(__name__)
 
 SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP")
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
 
 
 class Sampler(nn.Module):
@@ -77,7 +78,12 @@ class Sampler(nn.Module):
             batch_next_token_ids = torch.argmax(logits, -1)
             if return_logprob:
                 logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+
         else:
+            # Post process original logits. if temperatures are all 1.0, no need to rescale
+            if return_logprob and RETURN_ORIGINAL_LOGPROB:
+                logprobs = torch.softmax(logits, dim=-1)
+
             # Post process logits
             logits.div_(sampling_info.temperatures)
             logits[:] = torch.softmax(logits, dim=-1)
@@ -116,7 +122,12 @@ class Sampler(nn.Module):
 
         if return_logprob:
             # clamp to avoid -inf
-            logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
+            if RETURN_ORIGINAL_LOGPROB:
+                logprobs = torch.log(logprobs).clamp(
+                    min=torch.finfo(logprobs.dtype).min
+                )
+            else:
+                logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
 
         # Attach logprobs to logits_output (in-place modification)
         if return_logprob:
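Taken together, the env-var hunks above add a `RETURN_ORIGINAL_LOGPROB` switch: when set, the logprobs returned to the caller are computed from the raw model distribution rather than the temperature-scaled one actually sampled from. A minimal torch sketch of the distinction (illustrative values):

import torch

logits = torch.tensor([[2.0, 1.0, 0.5]])
temperature = 0.7

# Original distribution: log-softmax of the raw logits.
original = torch.log_softmax(logits, dim=-1)
# tensor([[-0.4645, -1.4645, -1.9645]])

# Processed distribution: logits rescaled by temperature before softmax;
# this is the distribution sampling actually draws from.
processed = torch.log_softmax(logits / temperature, dim=-1)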
@@ -201,7 +212,10 @@ def top_p_normalize_probs_torch(
     return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
 
 
-def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
+def get_top_logprobs(
+    logprobs: torch.Tensor,
+    top_logprobs_nums: List[int],
+):
     max_k = max(top_logprobs_nums)
     ret = logprobs.topk(max_k, dim=1)
     values = ret.values.tolist()
@@ -212,10 +226,17 @@ def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
     for i, k in enumerate(top_logprobs_nums):
         output_top_logprobs_val.append(values[i][:k])
         output_top_logprobs_idx.append(indices[i][:k])
-    return output_top_logprobs_val, output_top_logprobs_idx
+
+    return (
+        output_top_logprobs_val,
+        output_top_logprobs_idx,
+    )
 
 
-def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List[int]]):
+def get_token_ids_logprobs(
+    logprobs: torch.Tensor,
+    token_ids_logprobs: List[List[int]],
+):
     output_token_ids_logprobs_val = []
     output_token_ids_logprobs_idx = []
     for i, token_ids in enumerate(token_ids_logprobs):
@@ -226,7 +247,10 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
             output_token_ids_logprobs_val.append([])
             output_token_ids_logprobs_idx.append([])
 
-    return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+    return (
+        output_token_ids_logprobs_val,
+        output_token_ids_logprobs_idx,
+    )
 
 
 def apply_custom_logit_processor(
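These two helpers are only reformatted (multi-line signatures and parenthesized returns); behavior is unchanged. A quick usage sketch of the top-k extraction they perform (dummy tensor, invented per-request sizes):

import torch

logprobs = torch.log_softmax(torch.randn(2, 10), dim=-1)
top_logprobs_nums = [3, 1]  # top-3 for request 0, top-1 for request 1

max_k = max(top_logprobs_nums)
ret = logprobs.topk(max_k, dim=1)
values, indices = ret.values.tolist(), ret.indices.tolist()
vals = [values[i][:k] for i, k in enumerate(top_logprobs_nums)]
idxs = [indices[i][:k] for i, k in enumerate(top_logprobs_nums)]
assert len(vals[0]) == 3 and len(vals[1]) == 1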
sglang/srt/layers/utils.py
CHANGED
@@ -34,17 +34,3 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
-
-
-@lru_cache(maxsize=1)
-def is_sm100_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 10) and (
-        torch.version.cuda >= "12.8"
-    )
-
-
-@lru_cache(maxsize=1)
-def is_sm90_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 9) and (
-        torch.version.cuda >= "12.3"
-    )
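A side note on the helpers removed above (presumably relocated; sglang/srt/utils.py grows by 73 lines in this same release, though the moved definitions are not shown in this section): they compare `torch.version.cuda` as a string, so a hypothetical "12.10" would sort before "12.8". A version-aware variant, assuming the `packaging` package and a CUDA build of torch, might look like:

from functools import lru_cache

import torch
from packaging.version import Version

@lru_cache(maxsize=1)
def is_sm100_supported(device=None) -> bool:
    # Same intent as the removed helper, but with a numeric version
    # comparison instead of a lexicographic string comparison (sketch only).
    return torch.cuda.get_device_capability(device)[0] == 10 and Version(
        torch.version.cuda
    ) >= Version("12.8")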