sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +1 -11
- sglang/bench_serving.py +149 -1
- sglang/lang/chat_template.py +44 -0
- sglang/srt/configs/deepseekvl2.py +3 -0
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +17 -0
- sglang/srt/constrained/xgrammar_backend.py +11 -19
- sglang/srt/conversation.py +30 -3
- sglang/srt/disaggregation/decode.py +4 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +9 -18
- sglang/srt/disaggregation/nixl/conn.py +241 -71
- sglang/srt/disaggregation/utils.py +44 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +14 -2
- sglang/srt/entrypoints/http_server.py +28 -1
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +146 -50
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
- sglang/srt/layers/moe/ep_moe/layer.py +120 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +5 -0
- sglang/srt/layers/quantization/fp8.py +108 -95
- sglang/srt/layers/quantization/fp8_kernel.py +79 -60
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/lora/lora_manager.py +10 -13
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/schedule_batch.py +19 -1
- sglang/srt/managers/schedule_policy.py +11 -5
- sglang/srt/managers/scheduler.py +28 -13
- sglang/srt/managers/tokenizer_manager.py +24 -13
- sglang/srt/managers/tp_worker.py +9 -12
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +2 -2
- sglang/srt/model_executor/model_runner.py +44 -33
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +55 -20
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +53 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +24 -40
- sglang/srt/openai_api/protocol.py +28 -16
- sglang/srt/reasoning_parser.py +2 -2
- sglang/srt/sampling/sampling_batch_info.py +54 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +30 -6
- sglang/srt/utils.py +35 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -30,8 +30,13 @@ class SamplingBatchInfo:
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
+    # Use thinking_budget to truncate thinking
+    num_thinking_tokens: Optional[torch.Tensor] = None
+    think_end_ids: Optional[torch.Tensor] = None
+    thinking_budgets: Optional[torch.Tensor] = None
+
     # Masking tensors for grammar-guided structured outputs
-    vocab_size: int
+    vocab_size: int = 0
     grammars: Optional[List] = None
     vocab_mask: Optional[torch.Tensor] = None
     apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -76,7 +81,22 @@ class SamplingBatchInfo:
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
-
+        if any(hasattr(r.tokenizer, "think_end_id") for r in reqs):
+            think_end_ids = torch.tensor(
+                [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
+                dtype=torch.int64,
+            ).to(device, non_blocking=True)
+            num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
+                device, non_blocking=True
+            )
+            thinking_budgets = torch.tensor(
+                [r.sampling_params.thinking_budget or -1 for r in reqs],
+                dtype=torch.int64,
+            ).to(device, non_blocking=True)
+        else:
+            think_end_ids = None
+            num_thinking_tokens = None
+            thinking_budgets = None
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -132,6 +152,9 @@ class SamplingBatchInfo:
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
+            think_end_ids=think_end_ids,
+            num_thinking_tokens=num_thinking_tokens,
+            thinking_budgets=thinking_budgets,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
@@ -146,6 +169,35 @@ class SamplingBatchInfo:
     def __len__(self):
         return len(self.temperatures)
 
+    def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
+        has_budget = self.thinking_budgets > 0
+        if not has_budget.any():
+            return
+        torch.where(
+            has_budget,
+            self.num_thinking_tokens + 1,
+            self.num_thinking_tokens,
+            out=self.num_thinking_tokens,
+        )
+        should_stop = has_budget & (
+            self.num_thinking_tokens - 1 > self.thinking_budgets
+        )
+        next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
+        batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
+        if len(batch_indices) > 0:
+            end_token_indices = self.think_end_ids[batch_indices]
+            next_token_logits[batch_indices, end_token_indices] = 0.0
+
+    def update_thinking_budgets(self, next_token_ids: torch.Tensor):
+        if not torch.any(self.thinking_budgets > 0):
+            return
+        torch.where(
+            next_token_ids == self.think_end_ids,
+            torch.tensor(-1, device=self.thinking_budgets.device),
+            self.thinking_budgets,
+            out=self.thinking_budgets,
+        )
+
     def update_regex_vocab_mask(self):
         if not self.grammars:
             self.vocab_mask = None
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -30,6 +30,7 @@ class SamplingParams:
     def __init__(
         self,
         max_new_tokens: int = 128,
+        thinking_budget: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -57,6 +58,7 @@ class SamplingParams:
             self.stop_token_ids = set(stop_token_ids)
         else:
             self.stop_token_ids = None
+        self.thinking_budget = thinking_budget
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
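The hunks above add a per-request thinking budget: once a request has emitted more thinking tokens than its budget allows, the sampler masks every logit to -inf and forces the tokenizer's think-end token. The toy sketch below (not part of the package; the tensor values and the simplified masking shape are made up for illustration) mirrors that masking step on a small batch.

import torch

# Hypothetical values: think-end token id per request, thinking tokens emitted so far,
# and each request's thinking budget (request 0 is over budget, request 1 is not).
think_end_ids = torch.tensor([7, 7])
num_thinking_tokens = torch.tensor([5, 1])
thinking_budgets = torch.tensor([3, 8])

next_token_logits = torch.randn(2, 10)  # (batch, vocab)

# Mirror of the apply_thinking_budgets logic: mask everything for over-budget requests,
# then give the think-end token a finite logit so it is forced on the next step.
should_stop = (thinking_budgets > 0) & (num_thinking_tokens > thinking_budgets)
next_token_logits.masked_fill_(should_stop.unsqueeze(1), float("-inf"))
batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
next_token_logits[batch_indices, think_end_ids[batch_indices]] = 0.0

print(next_token_logits.argmax(dim=-1))  # request 0 is forced to the think-end token (7)

Requests opt in by passing thinking_budget through SamplingParams, as added in the __init__ hunk above.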
sglang/srt/server_args.py
CHANGED
@@ -187,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    mm_attention_backend: Optional[str] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -198,6 +199,7 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_ib_device: Optional[str] = None
+    pdlb_url: Optional[str] = None
 
     def __post_init__(self):
         # Expert parallelism
@@ -322,6 +324,9 @@ class ServerArgs:
                 assert (
                     not self.enable_dp_attention
                 ), "DeepEP MoE `auto` mode is not supported with DP Attention."
+            if self.deepep_mode == "normal":
+                logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
+                self.disable_cuda_graph = True
             self.ep_size = self.tp_size
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -347,10 +352,13 @@ class ServerArgs:
         model_arch = get_model_arch(self)
 
         # Auto set draft_model_path DeepSeek-V3/R1
-        if
-
-
-
+        if model_arch == "DeepseekV3ForCausalLM":
+            if self.speculative_draft_model_path is None:
+                self.speculative_draft_model_path = self.model_path
+            else:
+                logger.warning(
+                    "DeepSeek MTP does not require setting speculative_draft_model_path."
+                )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
@@ -551,7 +559,7 @@ class ServerArgs:
             "--device",
             type=str,
             default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
         parser.add_argument(
             "--served-model-name",
@@ -1247,7 +1255,23 @@ class ServerArgs:
             "--disaggregation-ib-device",
            type=str,
             default=ServerArgs.disaggregation_ib_device,
-            help="The
+            help="The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) "
+            "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
+        )
+        parser.add_argument(
+            "--pdlb-url",
+            type=str,
+            default=None,
+            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
+        )
+
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
         )
 
     @classmethod
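For reference, the three new CLI flags registered above behave like ordinary argparse options. A minimal standalone sketch (using a throwaway parser, not the package's own add_cli_args) of how they parse:

import argparse

# Re-declare the three flags added in the hunks above on a scratch parser.
parser = argparse.ArgumentParser()
parser.add_argument("--disaggregation-ib-device", type=str, default=None,
                    help="Single or comma-separated InfiniBand devices, e.g. mlx5_0,mlx5_1.")
parser.add_argument("--pdlb-url", type=str, default=None,
                    help="URL of the PD disaggregation load balancer.")
parser.add_argument("--mm-attention-backend", type=str, default=None,
                    choices=["sdpa", "fa3", "triton_attn"],
                    help="Multimodal attention backend.")

args = parser.parse_args(
    ["--mm-attention-backend", "fa3", "--pdlb-url", "http://127.0.0.1:8000"]
)
print(args.mm_attention_backend, args.pdlb_url, args.disaggregation_ib_device)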
sglang/srt/utils.py
CHANGED
@@ -145,6 +145,10 @@ def is_xpu() -> bool:
     return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 
+def is_npu() -> bool:
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -328,6 +332,16 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
     elif device == "cpu":
         # TODO: rename the variables in the current function to be not GPU specific
         free_gpu_memory = psutil.virtual_memory().available
+    elif device == "npu":
+        num_gpus = torch.npu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.npu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
+                "which may cause useless memory allocation for torch NPU context.",
+            )
+        free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
 
     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
@@ -897,7 +911,10 @@ def broadcast_pyobj(
     src: int = 0,
     force_cpu_device: bool = True,
 ):
-    """Broadcast inputs from rank
+    """Broadcast inputs from src rank to all other ranks with torch.dist backend.
+    The `rank` here refer to the source rank on global process group (regardless
+    of dist_group argument).
+    """
     device = torch.device(
         "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
     )
@@ -1345,6 +1362,9 @@ def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return torch.hpu.get_device_name(device_id)
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return torch.npu.get_device_name(device_id)
+
 
 @lru_cache(maxsize=1)
 def is_habana_available() -> bool:
@@ -1441,6 +1461,13 @@ def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        import torchair
+
+        config = torchair.CompilerConfig()
+        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        return npu_backend
+
     return "inductor"
 
 
@@ -2069,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
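The NPU additions above all follow the same capability-check pattern (hasattr on the torch submodule plus is_available()). A small self-contained sketch of that pattern, using a hypothetical detect_device helper that is not part of sglang:

import torch

def detect_device() -> str:
    # Probe each accelerator backend the way the helpers above do,
    # falling back to CPU when none is present.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if hasattr(torch, "hpu") and torch.hpu.is_available():
        return "hpu"
    if hasattr(torch, "npu") and torch.npu.is_available():
        return "npu"
    return "cpu"

print(detect_device())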
sglang/test/test_block_fp8.py
CHANGED
@@ -7,9 +7,9 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
-    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
@@ -236,7 +236,7 @@ class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
 
         with torch.inference_mode():
             ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
-            out, scale, _, _, _ =
+            out, scale, _, _, _ = per_token_group_quant_mla_deep_gemm_masked_fp8(
                 x, group_size
             )
             out = out[:, :num_tokens, :]
sglang/test/test_deepep_utils.py
ADDED
@@ -0,0 +1,219 @@
+# Copy from deepseek-ai/DeepEP/tests/test_utils.py
+
+import os
+import sys
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def init_dist(local_rank: int, num_local_ranks: int):
+    # NOTES: you may rewrite this function with your own cluster settings
+    ip = os.getenv("MASTER_ADDR", "127.0.0.1")
+    port = int(os.getenv("MASTER_PORT", "8361"))
+    num_nodes = int(os.getenv("WORLD_SIZE", 1))
+    node_rank = int(os.getenv("RANK", 0))
+    assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{ip}:{port}",
+        world_size=num_nodes * num_local_ranks,
+        rank=node_rank * num_local_ranks + local_rank,
+    )
+    torch.set_default_dtype(torch.bfloat16)
+    torch.set_default_device("cuda")
+    torch.cuda.set_device(local_rank)
+
+    return (
+        dist.get_rank(),
+        dist.get_world_size(),
+        dist.new_group(list(range(num_local_ranks * num_nodes))),
+    )
+
+
+def calc_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double() + 1, y.double() + 1
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return (1 - sim).item()
+
+
+def per_token_cast_to_fp8(x: torch.Tensor):
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
+    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
+    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
+    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
+
+
+def inplace_unique(x: torch.Tensor, num_slots: int):
+    assert x.dim() == 2
+    mask = x < 0
+    x_padded = x.masked_fill(mask, num_slots)
+    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
+    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = bin_count[:, :num_slots]
+    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
+    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    x[:, :].fill_(-1)
+    valid_len = min(num_slots, x.size(1))
+    x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
+
+
+def create_grouped_scores(
+    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
+):
+    num_tokens, num_experts = scores.shape
+    scores = scores.view(num_tokens, num_groups, -1)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
+    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    return (scores * mask).view(num_tokens, num_experts)
+
+
+def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+    # Flush L2 cache with 256 MB data
+    torch.cuda.synchronize()
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+
+    # Warmup
+    for _ in range(num_warmups):
+        fn()
+
+    # Flush L2
+    cache.zero_()
+
+    # Testing
+    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    for i in range(num_tests):
+        # Record
+        start_events[i].record()
+        fn()
+        end_events[i].record()
+        if post_fn is not None:
+            post_fn()
+    torch.cuda.synchronize()
+
+    times = np.array(
+        [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]
+    )[1:]
+    return np.average(times), np.min(times), np.max(times)
+
+
+class empty_suppress:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        pass
+
+
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: Optional[str] = None,
+    barrier_comm_profiling: bool = False,
+):
+    # Profile
+    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
+    with suppress():
+        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+        ) as prof:
+            for i in range(2):
+                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
+                if barrier_comm_profiling:
+                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    lhs @ rhs
+                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
+                for _ in range(num_tests):
+                    fn()
+                prof.step()
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tupled = isinstance(kernel_names, tuple)
+    prof_lines = (
+        prof.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    for name in kernel_names:
+        assert (
+            sum([name in line for line in prof_lines]) == 1
+        ), f"Errors of the kernel {name} in the profiling table"
+
+    # Save chrome traces
+    if trace_path is not None:
+        prof.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        kernel_times.append(float(time_str.replace(unit, "")) / scale)
+                        break
+                break
+    return tuple(kernel_times) if is_tupled else kernel_times[0]
+
+
+def hash_tensor(t: torch.Tensor):
+    return t.view(torch.int64).sum().item()
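The new test_deepep_utils.py helpers can also be exercised on their own; for example, a quick round trip through the per-token FP8 cast (assuming sglang 0.4.6.post3 is installed and the local PyTorch build supports torch.float8_e4m3fn):

import torch

from sglang.test.test_deepep_utils import calc_diff, per_token_cast_back, per_token_cast_to_fp8

x = torch.randn(4, 256, dtype=torch.float32)   # last dim must be a multiple of 128
x_fp8, x_scales = per_token_cast_to_fp8(x)     # FP8 values plus one scale per 128-wide group
x_back = per_token_cast_back(x_fp8, x_scales)  # dequantize back to bfloat16
print(calc_diff(x, x_back))                    # small relative error from quantization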
sglang/test/test_utils.py
CHANGED
@@ -66,6 +66,7 @@ DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
 )
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
@@ -78,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
 
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.6.post2"
+__version__ = "0.4.6.post3"
{sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.6.post2
+Version: 0.4.6.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: partial_json_parser; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
@@ -242,7 +243,7 @@ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
 Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
@@ -252,7 +253,6 @@ Requires-Dist: torch==2.6.0; extra == "srt"
 Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
-Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
@@ -261,7 +261,6 @@ Requires-Dist: torch; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
-Requires-Dist: partial_json_parser; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -278,6 +277,9 @@ Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
+Provides-Extra: srt-npu
+Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -319,6 +321,11 @@ Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
 Requires-Dist: sglang[openai]; extra == "all-cpu"
 Requires-Dist: sglang[anthropic]; extra == "all-cpu"
 Requires-Dist: sglang[litellm]; extra == "all-cpu"
+Provides-Extra: all-npu
+Requires-Dist: sglang[srt_npu]; extra == "all-npu"
+Requires-Dist: sglang[openai]; extra == "all-npu"
+Requires-Dist: sglang[anthropic]; extra == "all-npu"
+Requires-Dist: sglang[litellm]; extra == "all-npu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -358,6 +365,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
@@ -383,7 +391,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral,
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
@@ -401,7 +409,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Google Cloud, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>