sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +14 -1
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +27 -15
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +60 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/hf_transformers_utils.py +10 -0
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +240 -109
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +14 -13
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +9 -4
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +30 -25
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/managers/cache_controller.py +62 -96
- sglang/srt/managers/detokenizer_manager.py +9 -2
- sglang/srt/managers/io_struct.py +27 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +629 -0
- sglang/srt/managers/scheduler.py +39 -2
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +86 -39
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +20 -3
- sglang/srt/mem_cache/hiradix_cache.py +94 -71
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +4 -0
- sglang/srt/mem_cache/memory_pool_host.py +4 -4
- sglang/srt/mem_cache/radix_cache.py +5 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +5 -4
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +31 -10
- sglang/srt/models/gpt_oss.py +5 -18
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +65 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +112 -55
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/utils.py +4 -0
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +5 -5
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +93 -85
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py
CHANGED

```diff
@@ -46,6 +46,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
+    get_bool_env_var,
     is_cuda,
     next_power_of_2,
 )
@@ -54,6 +55,7 @@ if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
 
 
 @contextmanager
@@ -137,8 +139,15 @@ class EAGLEWorker(TpModelWorker):
         embed, head = self.target_worker.model_runner.model.get_embed_and_head()
 
         if self.speculative_algorithm.is_eagle3():
-            # EAGLE3 models don't share lm_head
-            self.draft_model_runner.model.set_embed(embed)
+            # most cases EAGLE3 models don't share lm_head
+            # but some models (e.g. nvidia/gpt-oss-120b-Eagle3) shares
+            if (
+                hasattr(self.draft_model_runner.model, "load_lm_head_from_target")
+                and self.draft_model_runner.model.load_lm_head_from_target
+            ):
+                self.draft_model_runner.model.set_embed_and_head(embed, head)
+            else:
+                self.draft_model_runner.model.set_embed(embed)
 
         # grab hot token ids
         if self.draft_model_runner.model.hot_token_id is not None:
```
```diff
@@ -781,15 +790,20 @@ class EAGLEWorker(TpModelWorker):
         token_ids_logprobs = batch.token_ids_logprobs
         accepted_indices = res.accepted_indices
         assert len(accepted_indices) == len(logits_output.next_token_logits)
+
         temperatures = batch.sampling_info.temperatures
         num_draft_tokens = batch.spec_info.draft_token_num
         # acceptance indices are the indices in a "flattened" batch.
         # dividing it to num_draft_tokens will yield the actual batch index.
         temperatures = temperatures[accepted_indices // num_draft_tokens]
-        logprobs = torch.nn.functional.log_softmax(
-            logits_output.next_token_logits / temperatures, dim=-1
-        )
-
+        if RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
+            )
+        else:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits / temperatures, dim=-1
+            )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]
 
```
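The new `RETURN_ORIGINAL_LOGPROB` flag chooses between logprobs computed from the raw logits and the temperature-scaled logprobs that were previously the only option. A minimal sketch of the difference in standalone PyTorch (values are illustrative, not sglang code):

```python
import torch

logits = torch.tensor([[2.0, 1.0, 0.0]])
temperature = torch.tensor([[0.5]])

# Default path: temperature-scaled logprobs (sharper distribution for T < 1).
scaled = torch.nn.functional.log_softmax(logits / temperature, dim=-1)

# RETURN_ORIGINAL_LOGPROB=1: logprobs of the unmodified model distribution.
original = torch.nn.functional.log_softmax(logits, dim=-1)

print(scaled)    # tensor([[-0.1429, -2.1429, -4.1429]])
print(original)  # tensor([[-0.4076, -1.4076, -2.4076]])
```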
```diff
@@ -806,13 +820,19 @@
             (
                 logits_output.next_token_top_logprobs_val,
                 logits_output.next_token_top_logprobs_idx,
-            ) = get_top_logprobs(logprobs, top_logprobs_nums_repeat_interleaved)
+            ) = get_top_logprobs(
+                logprobs,
+                top_logprobs_nums_repeat_interleaved,
+            )
 
         if any(x is not None for x in token_ids_logprobs):
             (
                 logits_output.next_token_token_ids_logprobs_val,
                 logits_output.next_token_token_ids_logprobs_idx,
-            ) = get_token_ids_logprobs(logprobs, token_ids_logprobs_repeat_interleaved)
+            ) = get_token_ids_logprobs(
+                logprobs,
+                token_ids_logprobs_repeat_interleaved,
+            )
 
         logits_output.next_token_logprobs = logprobs[
             torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
```
sglang/srt/utils.py
CHANGED

```diff
@@ -2787,6 +2787,10 @@ def lru_cache_frozenset(maxsize=128):
     return decorator
 
 
+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
```
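The helper pairs with the new multi-tokenizer support (see `multi_tokenizer_mixin.py` in the file list above), which by all appearances prefixes request ids with a worker id and an underscore; `get_origin_rid` recovers the original id. A behavior sketch (the prefixed rid formats are illustrative):

```python
def get_origin_rid(rid):
    # Strip everything up to and including the first "_", if present.
    return rid.split("_", 1)[1] if "_" in rid else rid

assert get_origin_rid("3_req-abc") == "req-abc"    # worker prefix stripped
assert get_origin_rid("w1_9f2e_0") == "9f2e_0"     # only the first "_" splits
assert get_origin_rid("plain-rid") == "plain-rid"  # no prefix: returned as-is
```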
sglang/test/attention/test_trtllm_mla_backend.py
CHANGED

```diff
@@ -208,6 +208,15 @@ class MockModelRunner:
         self.kv_cache_dtype = config["kv_cache_dtype"]
         self.page_size = config["page_size"]
 
+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
         # Model-config stub with MLA attributes
         self.model_config = type(
             "ModelConfig",
```
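The stub relies on the three-argument form of `type()`, which builds a class on the fly; entries in the namespace dict become class attributes that the attention backend can read without constructing a real `ServerArgs`. A minimal sketch of the pattern (names are illustrative):

```python
# type(name, bases, namespace) creates a new class object directly.
FakeArgs = type("FakeArgs", (), {"enable_dp_attention": False, "page_size": 64})

print(FakeArgs.enable_dp_attention)  # False -- readable on the class itself
print(FakeArgs().page_size)          # 64   -- or on an instance
```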
```diff
@@ -833,7 +842,7 @@ class TestTRTLLMMLA(CustomTestCase):
 
         # Test workspace properties
         self.assertEqual(metadata.workspace.device.type, "cuda")
-        self.assertEqual(metadata.workspace.dtype, torch.
+        self.assertEqual(metadata.workspace.dtype, torch.uint8)
         self.assertGreater(
             metadata.workspace.numel(), 0, "Workspace should have non-zero size"
         )
@@ -993,8 +1002,8 @@ class TestTRTLLMMLA(CustomTestCase):
         )
 
         # Verify CUDA graph buffers are allocated
-        self.assertIsNotNone(backend.
-        self.assertIsNotNone(backend.
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)
 
         # Test capture metadata
         seq_lens = torch.full(
```
sglang/test/test_cutlass_w4a8_moe.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional
+from typing import Literal, Optional
 
 import pytest
 import torch
@@ -25,7 +25,7 @@ def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
     return packed_tensor.to(torch.int8)
 
 
-def pack_interleave(num_experts, ref_weight, ref_scale):
+def pack_interleave(num_experts, ref_weight, ref_scale, alignment=4):
     n, k = ref_weight.shape[1], ref_weight.shape[2]
 
     weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
```
```diff
@@ -33,11 +33,16 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
     w_q = w_q.contiguous()
 
     scale_interleaved = ref_scale.reshape(
-        ref_scale.shape[0],
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
     )  # [E, N, K/4, 4]
     scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
     scale_interleaved = scale_interleaved.reshape(
-        ref_scale.shape[0],
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
     )  # [E, K/4, N*4]
     w_scale = scale_interleaved.contiguous()
 
```
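The reshape/permute/reshape sequence interleaves groups of `alignment` consecutive scale columns across the N rows. A toy illustration with small made-up shapes (E=1, N=2, K/group_size=8, alignment=4; not the test's real sizes):

```python
import torch

E, N, KG, align = 1, 2, 8, 4
ref_scale = torch.arange(E * N * KG).reshape(E, N, KG)  # rows 0..7 and 8..15

s = ref_scale.reshape(E, N, KG // align, align)  # [E, N, K/4, 4]
s = s.permute(0, 2, 1, 3)                        # [E, K/4, N, 4]
s = s.reshape(E, KG // align, N * align)         # [E, K/4, N*4]

# Each output row now interleaves 4-wide chunks taken from every N row:
print(s[0, 0])  # tensor([ 0,  1,  2,  3,  8,  9, 10, 11])
print(s[0, 1])  # tensor([ 4,  5,  6,  7, 12, 13, 14, 15])
```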
```diff
@@ -48,12 +53,17 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
 @pytest.mark.parametrize("N", [2048])
 @pytest.mark.parametrize("K", [7168])
 @pytest.mark.parametrize("E", [256])
-@pytest.mark.parametrize("
+@pytest.mark.parametrize("tp_size", [8])
+@pytest.mark.parametrize("use_ep_moe", [True, False])
 @pytest.mark.parametrize("topk", [8])
 @pytest.mark.parametrize("group_size", [128])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
-    local_e = E // ep_size
+def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dtype):
+    if use_ep_moe:
+        local_e = E // tp_size
+    else:  # tp mode
+        local_e = E
+        N = N // tp_size
 
     debug = False
     if debug:
```
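With the parametrized sizes (E=256, N=2048, tp_size=8), the two modes shard along different axes: EP splits the expert dimension at full FFN width, while TP keeps every expert but narrows N. A quick check of the arithmetic:

```python
E, N, tp_size = 256, 2048, 8

ep_local_e, ep_n = E // tp_size, N  # EP mode: 32 experts per rank, N = 2048
tp_local_e, tp_n = E, N // tp_size  # TP mode: 256 experts per rank, N = 256
```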
```diff
@@ -87,7 +97,10 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     )
 
     w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
-    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    if use_ep_moe:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    else:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2, 1)
 
     device = "cuda"
     a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
```
```diff
@@ -265,7 +278,9 @@ def ref(
 
     gate, fc1 = fc1.chunk(2, dim=-1)
     fc1 = fc1 * torch.nn.functional.silu(gate)
-    act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+    act = torch.clamp((fc1 / pre_quant_scale_2.float()), -448.0, 448.0).to(
+        torch.float8_e4m3fn
+    )
     act = act.to(dtype)
 
     w2 = ref_weight_2[e_idx]
```
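The clamp bounds activations to the representable range of `float8_e4m3fn`, whose largest finite value is 448 (the format has no infinity, so an out-of-range cast cannot simply saturate). A standalone check:

```python
import torch

print(torch.finfo(torch.float8_e4m3fn).max)  # 448.0

x = torch.tensor([500.0, -500.0, 100.0])
safe = torch.clamp(x, -448.0, 448.0).to(torch.float8_e4m3fn)
print(safe.float())  # tensor([ 448., -448.,  100.])
```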
sglang/version.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "0.5.1.post3"
+__version__ = "0.5.2rc1"
```
{sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1.post3
+Version: 0.5.2rc1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -251,18 +251,18 @@ Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: tiktoken; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.56.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.8; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.
+Requires-Dist: flashinfer_python==0.3.0; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.
+Requires-Dist: flashinfer_python==0.3.0; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
```