sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +170 -24
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +60 -1
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +69 -1
- sglang/srt/disaggregation/decode.py +21 -5
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/disaggregation/nixl/conn.py +6 -6
- sglang/srt/disaggregation/prefill.py +2 -2
- sglang/srt/disaggregation/utils.py +1 -1
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +40 -6
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +32 -9
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +20 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +26 -0
- sglang/srt/layers/linear.py +84 -14
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
- sglang/srt/layers/moe/ep_moe/layer.py +176 -15
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +10 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +72 -7
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -2
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +2 -2
- sglang/srt/layers/vocab_parallel_embedding.py +20 -10
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +58 -14
- sglang/srt/managers/mm_utils.py +77 -61
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +78 -85
- sglang/srt/managers/scheduler.py +130 -64
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +402 -66
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/cuda_graph_runner.py +2 -1
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +297 -56
- sglang/srt/model_loader/loader.py +41 -0
- sglang/srt/model_loader/weight_utils.py +72 -4
- sglang/srt/models/deepseek_nextn.py +1 -3
- sglang/srt/models/deepseek_v2.py +195 -45
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_causal.py +4 -3
- sglang/srt/models/gemma3n_mm.py +4 -20
- sglang/srt/models/hunyuan.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +402 -89
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +84 -22
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +203 -27
- sglang/srt/utils.py +343 -163
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/test/test_utils.py +15 -3
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/warmup.py
CHANGED
@@ -4,6 +4,7 @@ from typing import List
 import numpy as np
 import tqdm
 
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
@@ -20,17 +21,21 @@ def warmup(name: str) -> callable:
     return decorator
 
 
-async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
+async def execute_warmups(
+    disaggregation_mode: str,
+    warmup_names: List[str],
+    tokenizer_manager: TokenizerManager,
+):
     for warmup_name in warmup_names:
         if warmup_name not in _warmup_registry:
             logger.warning(f"Could not find custom warmup {warmup_name}")
             continue
         logger.info(f"Running warmup {warmup_name}")
-        await _warmup_registry[warmup_name](tokenizer_manager)
+        await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager)
 
 
 @warmup("voice_chat")
-async def voice_chat(tokenizer_manager: TokenizerManager):
+async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
     # this warms up the fused_moe triton kernels and caches them
     # if we don't do this we break real time inference for voice chat
     for i in tqdm.trange(1, 512):
@@ -44,4 +49,8 @@ async def voice_chat(tokenizer_manager: TokenizerManager):
                 "min_p": 0.0,
             },
         )
+        if disaggregation_mode != "null":
+            generate_req_input.bootstrap_room = 0
+            generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
         await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
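For orientation: every warmup registered through the `@warmup(...)` decorator must now accept the disaggregation mode as its first argument, since `execute_warmups` forwards it to each callback. A minimal sketch of a custom warmup under the new contract — the name `my_warmup` and its body are hypothetical, not part of this release:

    @warmup("my_warmup")  # hypothetical registration
    async def my_warmup(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
        req = GenerateReqInput(text="ping", sampling_params={"max_new_tokens": 8})
        # Under PD disaggregation (mode != "null"), warmup requests need fake
        # bootstrap info, mirroring what voice_chat does above.
        if disaggregation_mode != "null":
            req.bootstrap_room = 0
            req.bootstrap_host = FAKE_BOOTSTRAP_HOST
        await tokenizer_manager.generate_request(req, None).__anext__()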
sglang/test/runners.py
CHANGED
@@ -503,6 +503,8 @@ class SRTRunner:
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
         torchao_config: Optional[str] = None,
+        cuda_graph_max_bs: int = 4,
+        sleep_on_idle=False,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -538,8 +540,9 @@ class SRTRunner:
                 tokenizer_path=tokenizer_path,
                 enable_ep_moe=enable_ep_moe,
                 disable_overlap_schedule=disable_overlap_schedule,
-                cuda_graph_max_bs=
+                cuda_graph_max_bs=cuda_graph_max_bs,
                 disable_custom_all_reduce=disable_custom_all_reduce,
+                sleep_on_idle=sleep_on_idle,
                 **spec_kwargs,
             )
 
@@ -550,6 +553,12 @@ class SRTRunner:
         else:
             self.tokenizer = None
 
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        return self.engine.load_lora_adapter(lora_name, lora_path)
+
+    def unload_lora_adapter(self, lora_name: str):
+        return self.engine.unload_lora_adapter(lora_name)
+
     def forward(
         self,
         prompts: Union[
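The two new `SRTRunner` methods are thin passthroughs to the engine's dynamic LoRA API, letting tests hot-swap adapters without tearing the runner down. A rough usage sketch — the adapter name, path, and abbreviated constructor arguments are placeholders, not the exact test API:

    runner = SRTRunner(model_path, torch_dtype, model_type="generation")  # args abbreviated
    runner.load_lora_adapter(lora_name="demo-lora", lora_path="/tmp/demo_adapter")
    # ... run forward passes with the adapter active ...
    runner.unload_lora_adapter(lora_name="demo-lora")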
sglang/test/test_cutlass_w4a8_moe.py
NEW
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import pytest
+import torch
+
+from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+from sglang.srt.layers.moe.topk import select_experts
+
+
+def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
+    if int4_values_interleaved.shape[-1] % 2 != 0:
+        raise ValueError(
+            "the last dim size of int4_values_interleaved tensor must be even."
+        )
+
+    input_tensor_int8 = int4_values_interleaved.to(torch.int8)
+
+    low_nibbles = input_tensor_int8[..., 0::2]
+    high_nibbles = input_tensor_int8[..., 1::2]
+
+    packed_tensor = (high_nibbles << 4) | (low_nibbles & 0x0F)
+
+    return packed_tensor.to(torch.int8)
+
+
+def pack_interleave(num_experts, ref_weight, ref_scale):
+    n, k = ref_weight.shape[1], ref_weight.shape[2]
+
+    weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
+    w_q = weight.view((num_experts, n, k // 2)).view(torch.int8)
+    w_q = w_q.contiguous()
+
+    scale_interleaved = ref_scale.reshape(
+        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+    )  # [E, N, K/4, 4]
+    scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
+    scale_interleaved = scale_interleaved.reshape(
+        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+    )  # [E, K/4, N*4]
+    w_scale = scale_interleaved.contiguous()
+
+    return w_q, w_scale
+
+
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("N", [2048])
+@pytest.mark.parametrize("K", [7168])
+@pytest.mark.parametrize("E", [256])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
+    local_e = E // ep_size
+
+    debug = False
+    if debug:
+        a = torch.ones((M, K), dtype=dtype, device="cuda") * 0.001
+        ref_weight_1 = torch.ones((local_e, N * 2, K), dtype=torch.int8, device="cuda")
+        ref_weight_2 = torch.ones((local_e, K, N), dtype=torch.int8, device="cuda")
+        a1_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        scale_1 = torch.ones(
+            (local_e, N * 2, K // group_size), dtype=dtype, device="cuda"
+        )
+        scale_2 = torch.ones((local_e, K, N // group_size), dtype=dtype, device="cuda")
+    else:
+        a = torch.randn(M, K, dtype=dtype, device="cuda")
+        ref_weight_1 = torch.randint(
+            -8, 8, (local_e, N * 2, K), dtype=torch.int8, device="cuda"
+        )
+        ref_weight_2 = torch.randint(
+            -8, 8, (local_e, K, N), dtype=torch.int8, device="cuda"
+        )
+        affine_coeff = 0.005
+        a1_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        scale_1 = (
+            torch.randn(local_e, N * 2, K // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+        scale_2 = (
+            torch.randn(local_e, K, N // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+
+    w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
+    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+
+    device = "cuda"
+    a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    c_strides1 = torch.full((local_e, 3), 2 * N, device=device, dtype=torch.int64)
+    a_strides2 = torch.full((local_e, 3), N, device=device, dtype=torch.int64)
+    c_strides2 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    b_strides1 = a_strides1
+    s_strides13 = c_strides1
+    b_strides2 = a_strides2
+    s_strides2 = c_strides2
+
+    score = torch.randn((M, E), dtype=dtype, device=device)
+    topk_weights, topk_ids = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        top_k=topk,
+        use_grouped_topk=False,
+        renormalize=False,
+    )
+    expert_map = torch.arange(E, dtype=torch.int32, device=device)
+    expert_map[local_e:] = E
+
+    output = cutlass_moe(
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        0,
+        local_e - 1,
+        E,
+        a1_scale,
+        a2_scale,
+        expert_map,
+    )
+
+    ref_output = ref(
+        a,
+        local_e,
+        topk_weights,
+        topk_ids,
+        ref_weight_1,
+        ref_weight_2,
+        scale_1,
+        scale_2,
+        has_pre_quant=True,
+        has_alpha=True,
+        pre_quant_scale_1=a1_scale,
+        pre_quant_scale_2=a2_scale,
+        alpha_1=a1_scale,
+        alpha_2=a2_scale,
+    )
+
+    # compare
+    torch.cuda.synchronize()
+
+    # compare final output
+    torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
+    print("SUCCESS: Final output tensors are close.")
+
+
+def cutlass_moe(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    start_expert_id: int,
+    end_expert_id: int,
+    E: int,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+):
+    local_topk_ids = topk_ids_
+    local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
+    device = a.device
+
+    local_num_experts = end_expert_id - start_expert_id + 1
+    expert_offsets = torch.empty(
+        (local_num_experts + 1), dtype=torch.int32, device=device
+    )
+    problem_sizes1 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    problem_sizes2 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    return cutlass_w4a8_moe(
+        start_expert_id,
+        end_expert_id,
+        E,
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids_,
+        local_topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a1_scale,
+        a2_scale,
+        apply_router_weight_on_input,
+    )
+
+
+def ref(
+    x: torch.Tensor,
+    num_experts: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    ref_weight_1: torch.Tensor,
+    ref_weight_2: torch.Tensor,
+    ref_weight_scale_1: torch.Tensor,
+    ref_weight_scale_2: torch.Tensor,
+    has_pre_quant: bool = False,
+    has_alpha: bool = False,
+    pre_quant_scale_1: Optional[torch.Tensor] = None,
+    pre_quant_scale_2: Optional[torch.Tensor] = None,
+    alpha_1: Optional[torch.Tensor] = None,
+    alpha_2: Optional[torch.Tensor] = None,
+):
+    results = torch.zeros_like(x)
+    dtype = x.dtype
+    for e_idx in range(num_experts):
+        mask = topk_ids == e_idx
+        activated_tokens = mask.sum(1).bool()
+        act = x[activated_tokens, :]
+        if act.shape[0] == 0:
+            continue
+        final_scale = (topk_weights * mask).sum(1)[activated_tokens].unsqueeze(1)
+
+        act = (
+            torch.clamp((act / pre_quant_scale_1.float()), -448.0, 448.0)
+            .to(torch.float8_e4m3fn)
+            .to(dtype)
+        )
+        w3_w1 = ref_weight_1[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_1[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w3_w1 = (w3_w1.to(float) * ref_w_scale_repeat).to(dtype)
+        fc1 = ((torch.matmul(act, w3_w1.T)) * alpha_1).to(torch.float16)
+
+        gate, fc1 = fc1.chunk(2, dim=-1)
+        fc1 = fc1 * torch.nn.functional.silu(gate)
+        act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+        act = act.to(dtype)
+
+        w2 = ref_weight_2[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_2[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w2 = (w2.to(float) * ref_w_scale_repeat).to(dtype)
+        fc2 = (torch.matmul(act, w2.T) * alpha_2).to(torch.float16)
+
+        results[activated_tokens, :] += (fc2 * final_scale).to(results.dtype)
+
+    return results
sglang/test/test_utils.py
CHANGED
@@ -5,6 +5,7 @@ import copy
 import logging
 import os
 import random
+import re
 import subprocess
 import threading
 import time
@@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
         print(f"Output: {output}", flush=True)
         print(f"Error: {error}", flush=True)
 
-
-
+        # Return prefill_latency, decode_throughput, decode_latency
+        prefill_line = output.split("\n")[-9]
+        decode_line = output.split("\n")[-3]
+        pattern = (
+            r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
+        )
+        match = re.search(pattern, prefill_line)
+        if match:
+            prefill_latency = float(match.group("latency"))
+        match = re.search(pattern, decode_line)
+        if match:
+            decode_latency = float(match.group("latency"))
+            decode_throughput = float(match.group("throughput"))
     finally:
         kill_process_tree(process.pid)
 
-    return
+    return prefill_latency, decode_throughput, decode_latency
 
 
 def run_bench_offline_throughput(model, other_args):
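`run_bench_one_batch` now scrapes its metrics positionally from the subprocess output (the prefill summary nine lines from the end, the decode summary three from the end) and parses them with a named-group regex. A standalone check of that regex against an illustrative line — the line text approximates, but is not copied from, the real benchmark output:

    import re

    pattern = r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
    line = "Decode.  latency: 0.01531 s, throughput:   4180.22 token/s"  # illustrative
    m = re.search(pattern, line)
    assert m is not None
    print(float(m.group("latency")), float(m.group("throughput")))  # 0.01531 4180.22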
sglang/utils.py
CHANGED
@@ -1,6 +1,5 @@
 """Common utilities"""
 
-import base64
 import importlib
 import json
 import logging
@@ -20,6 +19,7 @@ from json import dumps
 from typing import Any, Callable, List, Optional, Tuple, Type, Union
 
 import numpy as np
+import pybase64
 import requests
 from IPython.display import HTML, display
 from pydantic import BaseModel
@@ -148,15 +148,15 @@ def encode_image_base64(image_path: Union[str, bytes]):
     if isinstance(image_path, str):
         with open(image_path, "rb") as image_file:
             data = image_file.read()
-        return base64.b64encode(data).decode("utf-8")
+        return pybase64.b64encode(data).decode("utf-8")
     elif isinstance(image_path, bytes):
-        return base64.b64encode(image_path).decode("utf-8")
+        return pybase64.b64encode(image_path).decode("utf-8")
     else:
         # image_path is PIL.WebPImagePlugin.WebPImageFile
        image = image_path
         buffered = BytesIO()
         image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return pybase64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
 def encode_frame(frame):
@@ -223,7 +223,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     video_bytes = b"".join(encoded_frames)
 
     # Encode the concatenated bytes to base64
-    video_base64 = "video:" + base64.b64encode(video_bytes).decode("utf-8")
+    video_base64 = "video:" + pybase64.b64encode(video_bytes).decode("utf-8")
 
     return video_base64
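The switch to `pybase64` is a drop-in module swap: pybase64 exposes the same `b64encode`/`b64decode` signatures as the stdlib `base64` module, just SIMD-accelerated. Equivalence sketch:

    import base64
    import pybase64

    data = b"sglang"
    assert pybase64.b64encode(data) == base64.b64encode(data)  # same output, faster encode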
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.8.post1"
+__version__ = "0.4.9.post1"
{sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.8.post1
+Version: 0.4.9.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -219,6 +219,7 @@ Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
+Requires-Dist: build; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -238,24 +239,26 @@ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.53.0; extra == "runtime-common"
+Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.20; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.
+Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: einops; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -264,7 +267,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -295,7 +298,6 @@ Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
 Requires-Dist: peft; extra == "test"
-Requires-Dist: timm; extra == "test"
 Requires-Dist: sentence_transformers; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
@@ -373,6 +375,8 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
+- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
@@ -416,7 +420,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>